diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..378a9c45781c71a3f3c763bd9f733122d56980e4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Shared objects +*.so + +# Distribution / packaging +build/ +*.egg-info/ +*.egg + +# Temporary files +*.swn +*.swo +*.swp + +# Dataset symlinks +detectron/datasets/data/* +!detectron/datasets/data/README.md + +# Generated C files +detectron/utils/cython_*.c diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..ff66225c8bd42e019f730040948f53d2bc7752d0 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..e918a3d5d7982626551173c118c3a63245112ec2 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/retina-net-r-main.iml b/.idea/retina-net-r-main.iml new file mode 100644 index 0000000000000000000000000000000000000000..6711606311e2664bd835f92b5c114681d2e284f5 --- /dev/null +++ b/.idea/retina-net-r-main.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000000000000000000000000000000000000..65f2988990c79dfa5fea98380e970fe66dcfb479 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1656041160941 + + + + \ No newline at end of file 
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd06f17afda144a125b14f89b90ac2e512d45e38 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,56 @@ +cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR) + +# Find the Caffe2 package. +# Caffe2 exports the required targets, so find_package should work for +# the standard Caffe2 installation. If you encounter problems with finding +# the Caffe2 package, make sure you have run `make install` when installing +# Caffe2 (`make install` populates your share/cmake/Caffe2). +find_package(Caffe2 REQUIRED) + +if (${CAFFE2_VERSION} VERSION_LESS 0.8.2) + # Pre-0.8.2 caffe2 does not have proper interface libraries set up, so we + # will rely on the old path. + message(WARNING + "You are using an older version of Caffe2 (version " ${CAFFE2_VERSION} + "). Please consider moving to a newer version.") + include(cmake/legacy/legacymake.cmake) + return() +endif() + +# Add compiler flags. +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2 -fPIC -Wno-narrowing") + +# Print configuration summary. +include(cmake/Summary.cmake) +detectron_print_config_summary() + +# Collect custom ops sources. +file(GLOB CUSTOM_OPS_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/detectron/ops/*.cc) +file(GLOB CUSTOM_OPS_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/detectron/ops/*.cu) + +# Install custom CPU ops lib. +add_library( + caffe2_detectron_custom_ops SHARED + ${CUSTOM_OPS_CPU_SRCS}) + +target_include_directories( + caffe2_detectron_custom_ops PRIVATE + ${CAFFE2_INCLUDE_DIRS}) + +target_link_libraries(caffe2_detectron_custom_ops caffe2_library) +install(TARGETS caffe2_detectron_custom_ops DESTINATION lib) + +# Install custom GPU ops lib, if gpu is present. 
+if (CAFFE2_USE_CUDA OR CAFFE2_FOUND_CUDA) + # Additional -I prefix is required for CMake versions before commit (< 3.7): + # https://github.com/Kitware/CMake/commit/7ded655f7ba82ea72a82d0555449f2df5ef38594 + list(APPEND CUDA_INCLUDE_DIRS -I${CAFFE2_INCLUDE_DIRS}) + CUDA_ADD_LIBRARY( + caffe2_detectron_custom_ops_gpu SHARED + ${CUSTOM_OPS_CPU_SRCS} + ${CUSTOM_OPS_GPU_SRCS}) + + target_link_libraries(caffe2_detectron_custom_ops_gpu caffe2_gpu_library) + install(TARGETS caffe2_detectron_custom_ops_gpu DESTINATION lib) +endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..cd482d89761c2eb60d1fc8c72c1708b7f47b8c82 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. 
+ +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." 
+ +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. 
+ +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..396a58ad3132c035c4e369ce8714949b975d44a8 --- /dev/null +++ b/Makefile @@ -0,0 +1,22 @@ +# Don't use the --user flag for setup.py develop mode with virtualenv. +DEV_USER_FLAG=$(shell python -c "import sys; print('' if hasattr(sys, 'real_prefix') else '--user')") + +.PHONY: default +default: dev + +.PHONY: install +install: + python setup.py install + +.PHONY: ops +ops: + mkdir -p build && cd build && cmake .. && make -j$(shell nproc) + +.PHONY: dev +dev: + python setup.py develop $(DEV_USER_FLAG) + +.PHONY: clean +clean: + python setup.py develop --uninstall $(DEV_USER_FLAG) + rm -rf build diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..5fcc013a09a9df06eed031ef2ef3770c7bdaf848 --- /dev/null +++ b/NOTICE @@ -0,0 +1,29 @@ +Portions of this software are derived from py-faster-rcnn. 
+ +============================================================================== +py-faster-rcnn licence +============================================================================== + +Faster R-CNN + +The MIT License (MIT) + +Copyright (c) 2015 Microsoft Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c00fea5f4e48a12e5d8f9ba74382e665cfc04de7 --- /dev/null +++ b/cmake/Summary.cmake @@ -0,0 +1,36 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Adapted from https://github.com/caffe2/caffe2/blob/master/cmake/Summary.cmake + +# Prints configuration summary. +function (detectron_print_config_summary) + message(STATUS "Summary:") + message(STATUS " CMake version : ${CMAKE_VERSION}") + message(STATUS " CMake command : ${CMAKE_COMMAND}") + message(STATUS " System name : ${CMAKE_SYSTEM_NAME}") + message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") + message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") + message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") + message(STATUS " Caffe2 version : ${CAFFE2_VERSION}") + message(STATUS " Caffe2 include path : ${CAFFE2_INCLUDE_DIRS}") + if (CAFFE2_USE_CUDA OR CAFFE2_FOUND_CUDA) + message(STATUS " Caffe2 found CUDA : True") + message(STATUS " CUDA version : ${CUDA_VERSION}") + message(STATUS " CuDNN version : ${CUDNN_VERSION}") + else() + message(STATUS " Caffe2 found CUDA : False") + endif() +endfunction() diff --git a/cmake/legacy/Cuda.cmake b/cmake/legacy/Cuda.cmake new file mode 100644 index 0000000000000000000000000000000000000000..af252e9bcb9b77b3bfebd9a25a16615cdca353c7 --- /dev/null +++ b/cmake/legacy/Cuda.cmake @@ -0,0 +1,259 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Copied from https://github.com/caffe2/caffe2/blob/master/cmake/Cuda.cmake + +# Caffe2 cmake utility to prepare for cuda build. +# This cmake file is called from Dependencies.cmake. You do not need to +# manually invoke it. + +# Known NVIDIA GPU achitectures Caffe2 can be compiled for. +# Default is set to cuda 9. If we detect the cuda architectores to be less than +# 9, we will lower it to the corresponding known archs. +set(Caffe2_known_gpu_archs "30 35 50 52 60 61 70") # for CUDA 9.x +set(Caffe2_known_gpu_archs8 "20 21(20) 30 35 50 52 60 61") # for CUDA 8.x +set(Caffe2_known_gpu_archs7 "20 21(20) 30 35 50 52") # for CUDA 7.x + + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# caffe_select_nvcc_arch_flags(out_variable) +function(caffe2_select_nvcc_arch_flags out_variable) + # List of arch names + set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") + set(__archs_name_default "All") + + # Set CUDA_ARCH_NAME strings (so it will be seen as dropbox in the CMake GUI) + set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture") + set_property(CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names}) + mark_as_advanced(CUDA_ARCH_NAME) + + # Verify CUDA_ARCH_NAME value + if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " __archs_names "${__archs_names}") + 
message(FATAL_ERROR "Invalid CUDA_ARCH_NAME, supported values: ${__archs_names}. Got ${CUDA_ARCH_NAME}") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN "" CACHE STRING + "Specify GPU architectures to build binaries for (BIN(PTX) format is supported)") + set(CUDA_ARCH_PTX "" CACHE STRING + "Specify GPU architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(__cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(__cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(__cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(__cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(__cuda_arch_bin ${Caffe2_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(__cuda_arch_bin ${CUDA_ARCH_BIN}) + set(__cuda_arch_ptx ${CUDA_ARCH_PTX}) + else() + message(FATAL_ERROR "Invalid CUDA_ARCH_NAME") + endif() + + # Remove dots and convert to lists + string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX REPLACE "\\." 
"" __cuda_arch_ptx "${__cuda_arch_ptx}") + string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") + list(REMOVE_DUPLICATES __cuda_arch_bin) + list(REMOVE_DUPLICATES __cuda_arch_ptx) + + set(__nvcc_flags "") + set(__nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(__arch ${__cuda_arch_bin}) + if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) + list(APPEND __nvcc_archs_readable sm_${__arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(__arch ${__cuda_arch_ptx}) + list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) + list(APPEND __nvcc_archs_readable compute_${__arch}) + endforeach() + + string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") + set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) +endfunction() + + +################################################################################################ +# Short command for cuda compilation +# Usage: +# caffe_cuda_compile( ) +macro(caffe2_cuda_compile objlist_variable) + foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var}_backup_in_cuda_compile_ "${${var}}") + + # we remove /EHa as it generates warnings under windows + string(REPLACE "/EHa" "" ${var} "${${var}}") + + endforeach() + + if(APPLE) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) + endif() + + cuda_compile(cuda_objcs ${ARGN}) + + 
foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) + set(${var} "${${var}_backup_in_cuda_compile_}") + unset(${var}_backup_in_cuda_compile_) + endforeach() + + set(${objlist_variable} ${cuda_objcs}) +endmacro() + +################################################################################################ +### Non macro section +################################################################################################ + +# Special care for windows platform: we know that 32-bit windows does not support cuda. +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + if(NOT (CMAKE_SIZEOF_VOID_P EQUAL 8)) + message(FATAL_ERROR + "CUDA support not available with 32-bit windows. Did you " + "forget to set Win64 in the generator target?") + return() + endif() +endif() + +find_package(CUDA 7.0 QUIET) +find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand + +if(NOT CUDA_FOUND) + set(HAVE_CUDA FALSE) + return() +endif() + +set(HAVE_CUDA TRUE) +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + message(FATAL_ERROR "Caffe2 requires CUDA 7.0 or later version") +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(Caffe2_known_gpu_archs ${Caffe2_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. 
+ list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +endif() + +caffe2_include_directories(${CUDA_INCLUDE_DIRS}) +list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${CUDA_CUDART_LIBRARY} + ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) + +# find libcuda.so and lbnvrtc.so +# For libcuda.so, we will find it under lib, lib64, and then the +# stubs folder, in case we are building on a system that does not +# have cuda driver installed. On windows, we also search under the +# folder lib/x64. + +find_library(CUDA_CUDA_LIB cuda + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/stubs lib64/stubs lib/x64) +find_library(CUDA_NVRTC_LIB nvrtc + PATHS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) + +# setting nvcc arch flags +caffe2_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +if(CUDA_CUDA_LIB) + message(STATUS "Found libcuda: ${CUDA_CUDA_LIB}") + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${CUDA_CUDA_LIB}) +else() + message(FATAL_ERROR "Cannot find libcuda.so. Please file an issue on https://github.com/caffe2/caffe2 with your build output.") +endif() +if(CUDA_NVRTC_LIB) + message(STATUS "Found libnvrtc: ${CUDA_NVRTC_LIB}") + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${CUDA_NVRTC_LIB}) +else() + message(FATAL_ERROR "Cannot find libnvrtc.so. Please file an issue on https://github.com/caffe2/caffe2 with your build output.") +endif() + +# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. 
+foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) + list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag}) +endforeach() + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) +if (NOT MSVC) + list(APPEND CUDA_NVCC_FLAGS "-std=c++14") + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +endif() + +# Debug and Release symbol support +if (MSVC) + if (${CMAKE_BUILD_TYPE} MATCHES "Release") + if (${BUILD_SHARED_LIBS}) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MD") + else() + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MT") + endif() + elseif(${CMAKE_BUILD_TYPE} MATCHES "Debug") + message(FATAL_ERROR + "Caffe2 currently does not support the combination of MSVC, Cuda " + "and Debug mode. Either set USE_CUDA=OFF or set the build type " + "to Release") + if (${BUILD_SHARED_LIBS}) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MDd") + else() + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -MTd") + endif() + else() + message(FATAL_ERROR "Unknown cmake build type: " ${CMAKE_BUILD_TYPE}) + endif() +endif() + + +if(OpenMP_FOUND) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler ${OpenMP_CXX_FLAGS}") +endif() + +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/legacy/Dependencies.cmake b/cmake/legacy/Dependencies.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5cb9dd8e391e78c89baea0a6d702b5ce61bac4f2 --- /dev/null +++ b/cmake/legacy/Dependencies.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Adapted from https://github.com/caffe2/caffe2/blob/master/cmake/Dependencies.cmake + +# Find CUDA. +include(cmake/legacy/Cuda.cmake) +if (HAVE_CUDA) + # CUDA 9.x requires GCC version <= 6 + if ((CUDA_VERSION VERSION_EQUAL 9.0) OR + (CUDA_VERSION VERSION_GREATER 9.0 AND CUDA_VERSION VERSION_LESS 10.0)) + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND + NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 7.0 AND + CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER) + message(FATAL_ERROR + "CUDA ${CUDA_VERSION} is not compatible with GCC version >= 7. " + "Use the following option to use another version (for example): \n" + " -DCUDA_HOST_COMPILER=/usr/bin/gcc-6\n") + endif() + # CUDA 8.0 requires GCC version <= 5 + elseif (CUDA_VERSION VERSION_EQUAL 8.0) + if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND + NOT CMAKE_C_COMPILER_VERSION VERSION_LESS 6.0 AND + CUDA_HOST_COMPILER STREQUAL CMAKE_C_COMPILER) + message(FATAL_ERROR + "CUDA 8.0 is not compatible with GCC version >= 6. " + "Use the following option to use another version (for example): \n" + " -DCUDA_HOST_COMPILER=/usr/bin/gcc-5\n") + endif() + endif() +endif() + +# Find CUDNN. 
+if (HAVE_CUDA) + find_package(CuDNN REQUIRED) + if (CUDNN_FOUND) + caffe2_include_directories(${CUDNN_INCLUDE_DIRS}) + endif() +endif() diff --git a/cmake/legacy/Modules/FindCuDNN.cmake b/cmake/legacy/Modules/FindCuDNN.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0b7ea943f907e6caf176d00c3f49ab89be3c04de --- /dev/null +++ b/cmake/legacy/Modules/FindCuDNN.cmake @@ -0,0 +1,70 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +# Copied from https://github.com/caffe2/caffe2/blob/master/cmake/Modules/FindCuDNN.cmake + +# - Try to find cuDNN +# +# The following variables are optionally searched for defaults +# CUDNN_ROOT_DIR: Base directory where all cuDNN components are found +# +# The following are set after configuration is done: +# CUDNN_FOUND +# CUDNN_INCLUDE_DIRS +# CUDNN_LIBRARIES +# CUDNN_LIBRARY_DIRS + +include(FindPackageHandleStandardArgs) + +set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN") + +find_path(CUDNN_INCLUDE_DIR cudnn.h + HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES cuda/include include) + +find_library(CUDNN_LIBRARY cudnn + HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + +find_package_handle_standard_args( + CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARY) + +if(CUDNN_FOUND) + # get cuDNN version + file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_HEADER_CONTENTS) + string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" + CUDNN_VERSION_MAJOR "${CUDNN_HEADER_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}") + string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)" + CUDNN_VERSION_MINOR "${CUDNN_HEADER_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1" + CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" + CUDNN_VERSION_PATCH "${CUDNN_HEADER_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" + CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}") + # Assemble cuDNN version + if(NOT CUDNN_VERSION_MAJOR) + set(CUDNN_VERSION "?") + else() + set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") + endif() + + set(CUDNN_INCLUDE_DIRS ${CUDNN_INCLUDE_DIR}) + set(CUDNN_LIBRARIES ${CUDNN_LIBRARY}) + message(STATUS 
"Found cuDNN: v${CUDNN_VERSION} (include: ${CUDNN_INCLUDE_DIR}, library: ${CUDNN_LIBRARY})") + mark_as_advanced(CUDNN_ROOT_DIR CUDNN_LIBRARY CUDNN_INCLUDE_DIR) +endif() diff --git a/cmake/legacy/Summary.cmake b/cmake/legacy/Summary.cmake new file mode 100644 index 0000000000000000000000000000000000000000..531377cfd5f4ffd524850e36872827bbdddf9a14 --- /dev/null +++ b/cmake/legacy/Summary.cmake @@ -0,0 +1,34 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Adapted from https://github.com/caffe2/caffe2/blob/master/cmake/Summary.cmake + +# Prints configuration summary. 
+function (detectron_print_config_summary) + message(STATUS "Summary:") + message(STATUS " CMake version : ${CMAKE_VERSION}") + message(STATUS " CMake command : ${CMAKE_COMMAND}") + message(STATUS " System name : ${CMAKE_SYSTEM_NAME}") + message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") + message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") + message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") + message(STATUS " Caffe2 version : ${CAFFE2_VERSION}") + message(STATUS " Caffe2 include path : ${CAFFE2_INCLUDE_DIRS}") + message(STATUS " Have CUDA : ${HAVE_CUDA}") + if (${HAVE_CUDA}) + message(STATUS " CUDA version : ${CUDA_VERSION}") + message(STATUS " CuDNN version : ${CUDNN_VERSION}") + endif() +endfunction() diff --git a/cmake/legacy/Utils.cmake b/cmake/legacy/Utils.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e613650cc665aed9eb003f1965f363c5d915ae8f --- /dev/null +++ b/cmake/legacy/Utils.cmake @@ -0,0 +1,290 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+##############################################################################
+
+# Copied from https://github.com/caffe2/caffe2/blob/master/cmake/Utils.cmake
+
+################################################################################################
+# Exclude and prepend functionalities
+function (exclude OUTPUT INPUT)
+set(EXCLUDES ${ARGN})
+foreach(EXCLUDE ${EXCLUDES})
+  list(REMOVE_ITEM INPUT "${EXCLUDE}")
+endforeach()
+set(${OUTPUT} ${INPUT} PARENT_SCOPE)
+endfunction(exclude)
+
+function (prepend OUTPUT PREPEND)
+set(OUT "")
+foreach(ITEM ${ARGN})
+  list(APPEND OUT "${PREPEND}${ITEM}")
+endforeach()
+set(${OUTPUT} ${OUT} PARENT_SCOPE)
+endfunction(prepend)
+
+
+################################################################################################
+# Clears variables from list
+# Usage:
+#   caffe_clear_vars(<list of variables>)
+macro(caffe_clear_vars)
+  foreach(_var ${ARGN})
+    unset(${_var})
+  endforeach()
+endmacro()
+
+################################################################################################
+# Prints list element per line
+# Usage:
+#   caffe_print_list(<list>)
+function(caffe_print_list)
+  foreach(e ${ARGN})
+    message(STATUS ${e})
+  endforeach()
+endfunction()
+
+################################################################################################
+# Reads set of version defines from the header file
+# Usage:
+#   caffe_parse_header(<file> <define1> <define2> <define3> ..)
+macro(caffe_parse_header FILENAME FILE_VAR) + set(vars_regex "") + set(__parnet_scope OFF) + set(__add_cache OFF) + foreach(name ${ARGN}) + if("${name}" STREQUAL "PARENT_SCOPE") + set(__parnet_scope ON) + elseif("${name}" STREQUAL "CACHE") + set(__add_cache ON) + elseif(vars_regex) + set(vars_regex "${vars_regex}|${name}") + else() + set(vars_regex "${name}") + endif() + endforeach() + if(EXISTS "${FILENAME}") + file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) + else() + unset(${FILE_VAR}) + endif() + foreach(name ${ARGN}) + if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") + if(${FILE_VAR}) + if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") + string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") + else() + set(${name} "") + endif() + if(__add_cache) + set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) + elseif(__parnet_scope) + set(${name} "${${name}}" PARENT_SCOPE) + endif() + else() + unset(${name} CACHE) + endif() + endif() + endforeach() +endmacro() + +################################################################################################ +# Reads single version define from the header file and parses it +# Usage: +# caffe_parse_header_single_define( ) +function(caffe_parse_header_single_define LIBNAME HDR_PATH VARNAME) + set(${LIBNAME}_H "") + if(EXISTS "${HDR_PATH}") + file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) + endif() + + if(${LIBNAME}_H) + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") + string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") + set(${LIBNAME}_VERSION_MAJOR 
${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) + + # append a TWEAK version if it exists: + set(${LIBNAME}_VERSION_TWEAK "") + if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") + set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) + endif() + if(${LIBNAME}_VERSION_TWEAK) + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) + else() + set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) + endif() + endif() +endfunction() + +######################################################################################################## +# An option that the user can select. Can accept condition to control when option is available for user. 
+# Usage:
+#   caffe_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
+function(caffe_option variable description value)
+  set(__value ${value})
+  set(__condition "")
+  set(__varname "__value")
+  foreach(arg ${ARGN})
+    if(arg STREQUAL "IF" OR arg STREQUAL "if")
+      set(__varname "__condition")
+    else()
+      list(APPEND ${__varname} ${arg})
+    endif()
+  endforeach()
+  unset(__varname)
+  if("${__condition}" STREQUAL "")
+    set(__condition 2 GREATER 1)
+  endif()
+
+  if(${__condition})
+    if("${__value}" MATCHES ";")
+      if(${__value})
+        option(${variable} "${description}" ON)
+      else()
+        option(${variable} "${description}" OFF)
+      endif()
+    elseif(DEFINED ${__value})
+      if(${__value})
+        option(${variable} "${description}" ON)
+      else()
+        option(${variable} "${description}" OFF)
+      endif()
+    else()
+      option(${variable} "${description}" ${__value})
+    endif()
+  else()
+    unset(${variable} CACHE)
+  endif()
+endfunction()
+
+##############################################################################
+# Helper function to add as-needed flag around a library.
+function(caffe_add_as_needed_flag lib output_var)
+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    # TODO: Clang seems to not need this flag. Double check.
+    set(${output_var} ${lib} PARENT_SCOPE)
+  elseif(MSVC)
+    # TODO: check what is the behavior of MSVC.
+    # In MSVC, we will add whole archive in default.
+    set(${output_var} ${lib} PARENT_SCOPE)
+  else()
+    # Assume everything else is like gcc: we will need as-needed flag.
+    set(${output_var} -Wl,--no-as-needed ${lib} -Wl,--as-needed PARENT_SCOPE)
+  endif()
+endfunction()
+
+##############################################################################
+# Helper function to add whole_archive flag around a library.
+function(caffe_add_whole_archive_flag lib output_var)
+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+    set(${output_var} -Wl,-force_load,$<TARGET_FILE:${lib}> PARENT_SCOPE)
+  elseif(MSVC)
+    # In MSVC, we will add whole archive in default.
+    set(${output_var} -WHOLEARCHIVE:$<TARGET_FILE:${lib}> PARENT_SCOPE)
+  else()
+    # Assume everything else is like gcc
+    set(${output_var} -Wl,--whole-archive ${lib} -Wl,--no-whole-archive PARENT_SCOPE)
+  endif()
+endfunction()
+
+##############################################################################
+# Helper function to add either as-needed, or whole_archive flag around a library.
+function(caffe_add_linker_flag lib output_var)
+  if (BUILD_SHARED_LIBS)
+    caffe_add_as_needed_flag(${lib} tmp)
+  else()
+    caffe_add_whole_archive_flag(${lib} tmp)
+  endif()
+  set(${output_var} ${tmp} PARENT_SCOPE)
+endfunction()
+
+##############################################################################
+# Helper function to automatically generate __init__.py files where python
+# sources reside but there are no __init__.py present.
+function(caffe_autogen_init_py_files)
+  file(GLOB_RECURSE all_python_files RELATIVE ${PROJECT_SOURCE_DIR}
+       "${PROJECT_SOURCE_DIR}/caffe2/*.py")
+  set(python_paths_need_init_py)
+  foreach(python_file ${all_python_files})
+    get_filename_component(python_path ${python_file} PATH)
+    string(REPLACE "/" ";" path_parts ${python_path})
+    set(rebuilt_path ${CMAKE_BINARY_DIR})
+    foreach(path_part ${path_parts})
+      set(rebuilt_path "${rebuilt_path}/${path_part}")
+      list(APPEND python_paths_need_init_py ${rebuilt_path})
+    endforeach()
+  endforeach()
+  list(REMOVE_DUPLICATES python_paths_need_init_py)
+  # Since the _pb2.py files are yet to be created, we will need to manually
+  # add them to the list.
+ list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe) + list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe/proto) + list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe2/proto) + + foreach(tmp ${python_paths_need_init_py}) + if(NOT EXISTS ${tmp}/__init__.py) + # message(STATUS "Generate " ${tmp}/__init__.py) + file(WRITE ${tmp}/__init__.py "") + endif() + endforeach() +endfunction() + +############################################################################## +# Creating a Caffe2 binary target with sources specified with relative path. +# Usage: +# caffe2_binary_target(target_name_or_src [] [] ...) +# If only target_name_or_src is specified, this target is build with one single +# source file and the target name is autogen from the filename. Otherwise, the +# target name is given by the first argument and the rest are the source files +# to build the target. +function(caffe2_binary_target target_name_or_src) + if (${ARGN}) + set(__target ${target_name_or_src}) + prepend(__srcs "${CMAKE_CURRENT_SOURCE_DIR}/" "${ARGN}") + else() + get_filename_component(__target ${target_name_or_src} NAME_WE) + prepend(__srcs "${CMAKE_CURRENT_SOURCE_DIR}/" "${target_name_or_src}") + endif() + add_executable(${__target} ${__srcs}) + add_dependencies(${__target} ${Caffe2_MAIN_LIBS_ORDER}) + target_link_libraries(${__target} ${Caffe2_MAIN_LIBS} ${Caffe2_DEPENDENCY_LIBS}) + install(TARGETS ${__target} DESTINATION bin) +endfunction() + +############################################################################## +# Helper function to add paths to system include directories. +# +# Anaconda distributions typically contain a lot of packages and some +# of those can conflict with headers/libraries that must be sourced +# from elsewhere. This helper ensures that Anaconda paths are always +# added AFTER other include paths, such that it does not accidentally +# takes precedence when it shouldn't. 
+# +# This is just a heuristic and does not have any guarantees. We can +# add other corner cases here (as long as they are generic enough). +# A complete include path cross checker is a final resort if this +# hacky approach proves insufficient. +# +function(caffe2_include_directories) + foreach(path IN LISTS ARGN) + if (${path} MATCHES "/anaconda") + include_directories(AFTER SYSTEM ${path}) + else() + include_directories(BEFORE SYSTEM ${path}) + endif() + endforeach() +endfunction() diff --git a/cmake/legacy/legacymake.cmake b/cmake/legacy/legacymake.cmake new file mode 100644 index 0000000000000000000000000000000000000000..82ffcf12accea893bd029c679ea74de0399d441b --- /dev/null +++ b/cmake/legacy/legacymake.cmake @@ -0,0 +1,63 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# This file contains legacy cmake scripts that is going to be removed +# in a future release. + +# Add CMake modules. +list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/legacy/Modules) + +# Add compiler flags. +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O2 -fPIC -Wno-narrowing") + +# Include Caffe2 CMake utils. +include(cmake/legacy/Utils.cmake) + +# Find dependencies. +include(cmake/legacy/Dependencies.cmake) + +# Print configuration summary. 
+include(cmake/legacy/Summary.cmake) +detectron_print_config_summary() + +# Collect custom ops sources. +file(GLOB CUSTOM_OPS_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/detectron/ops/*.cc) +file(GLOB CUSTOM_OPS_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/detectron/ops/*.cu) + +# Install custom CPU ops lib. +add_library( + caffe2_detectron_custom_ops SHARED + ${CUSTOM_OPS_CPU_SRCS}) + +target_include_directories( + caffe2_detectron_custom_ops PRIVATE + ${CAFFE2_INCLUDE_DIRS}) +target_link_libraries(caffe2_detectron_custom_ops caffe2) +install(TARGETS caffe2_detectron_custom_ops DESTINATION lib) + +# Install custom GPU ops lib. +if (${HAVE_CUDA}) + # Additional -I prefix is required for CMake versions before commit (< 3.7): + # https://github.com/Kitware/CMake/commit/7ded655f7ba82ea72a82d0555449f2df5ef38594 + list(APPEND CUDA_INCLUDE_DIRS -I${CAFFE2_INCLUDE_DIRS}) + CUDA_ADD_LIBRARY( + caffe2_detectron_custom_ops_gpu SHARED + ${CUSTOM_OPS_CPU_SRCS} + ${CUSTOM_OPS_GPU_SRCS}) + + target_link_libraries(caffe2_detectron_custom_ops_gpu caffe2_gpu) + install(TARGETS caffe2_detectron_custom_ops_gpu DESTINATION lib) +endif() diff --git a/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_2x_gn.yaml b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_2x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c8c96968d903381c24da76c04b170fb4684e6a6 --- /dev/null +++ b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_2x_gn.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: 
bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl # Note: a GN pre-trained model + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_3x_gn.yaml b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe463daac800088faaeb19a8acb4942ce39fea75 --- /dev/null +++ b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-101-FPN_3x_gn.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 270000 + STEPS: [0, 210000, 250000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl # Note: a GN pre-trained model + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + 
RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . diff --git a/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn.yaml b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eecae452650b6045cf05cb10eb89647f14daca13 --- /dev/null +++ b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl # Note: a GN pre-trained model + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + 
RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . diff --git a/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_3x_gn.yaml b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94950ff42ee7f58f9d65c6f25843eff7d291dcfc --- /dev/null +++ b/configs/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_3x_gn.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 270000 + STEPS: [0, 210000, 250000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl # Note: a GN pre-trained model + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 
800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . diff --git a/configs/04_2018_gn_baselines/mask_rcnn_R-50-FPN_1x_gn.yaml b/configs/04_2018_gn_baselines/mask_rcnn_R-50-FPN_1x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3543bf24fe2412fb6f420162cfd1fe81474556dd --- /dev/null +++ b/configs/04_2018_gn_baselines/mask_rcnn_R-50-FPN_1x_gn.yaml @@ -0,0 +1,52 @@ +# WARNING: this script uses **pre-computed** BN-based proposals, and is for quick debugging only. +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl # Note: a GN pre-trained model + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: 
('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-101-FPN_3x_gn.yaml b/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-101-FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c30a0c2cafc2b0762496788f050ad8a3a84b2d8d --- /dev/null +++ b/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-101-FPN_3x_gn.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 270000 + STEPS: [0, 210000, 250000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: 
+ ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # WEIGHTS: N/A + FREEZE_AT: 0 + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . diff --git a/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-50-FPN_3x_gn.yaml b/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-50-FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1245f3ab6a2713831d0ef1f4dab4f1efb441e102 --- /dev/null +++ b/configs/04_2018_gn_baselines/scratch_e2e_mask_rcnn_R-50-FPN_3x_gn.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 270000 + STEPS: [0, 210000, 250000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True + USE_GN: True # Note: use GN on the FPN-specific layers +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_gn_transformation # Note: this is a GN bottleneck transform + STEM_FUNC: basic_gn_stem # Note: this is a GN stem + SHORTCUT_FUNC: basic_gn_shortcut # Note: this is a GN shortcut +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_Xconv1fc_gn_head # Note: this is a Conv GN head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: 
mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs_gn # Note: this is a GN mask head + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # WEIGHTS: N/A + FREEZE_AT: 0 + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f17fa37eb2883611d04678cd40f8ad7613dd4da --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_2x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42f9f5dcceb99521e8d94174cbd42b31eab11d9b --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_2x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b5c12234e09a66ca61b7b824bc3bb361c324c51 --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml @@ -0,0 +1,34 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 6000 + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_2x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44686a99ec328c0d97dabb847a5d2c03e0f1c3df --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-C4_2x.yaml @@ -0,0 +1,34 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 6000 + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f9bb4afbe50ee06d4693bb2cfa96750a43fd90d --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_2x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..457a3ec44722ad4e41f69f588e8e4d291a90599c --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_2x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e87e4df571a12bc50855b0ea033cf05939ef5e6b --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,44 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_2x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c8d4e0d5ff88c27a6d3910d9b9b74de348b2b96 --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_2x.yaml @@ -0,0 +1,44 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec91bfda2e20bc659bd4b993e87eac45d8e51d4b --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,44 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_2x.yaml b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea875df0097453d852e0105ba9078a0db7e3e440 --- /dev/null +++ b/configs/12_2017_baselines/e2e_faster_rcnn_X-101-64x4d-FPN_2x.yaml @@ -0,0 +1,45 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db1691194bc58085a959efa0b487b483401ae528 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,51 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_s1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77d20ffb44f97fc60b4e26d247c782fb7cbab3b5 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-101-FPN_s1x.yaml @@ -0,0 +1,51 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bb94160c1067abb712d17467f2931997fec82e2 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,51 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_s1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e93324fb24459ef7f61f5489a9e5520e59fe0ee4 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_s1x.yaml @@ -0,0 +1,51 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8deaf6aaa8611e21a6861d2efb972d378e223005 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,56 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dd2a9cd8d325c1612c0e9f4ea6a0968ef9f8eb9 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml @@ -0,0 +1,56 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b710030f699104e1064c1eee0f369dd518451901 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,57 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7030db7933df5aff0c94e0c82d2eeab5ef3f32e1 --- /dev/null +++ b/configs/12_2017_baselines/e2e_keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml @@ -0,0 +1,57 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + FASTER_RCNN: True + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43a0924f68bcac69c26800e066e194dcde047288 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..002d3ac18422b247828987a9728a7ba18b29d4b3 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_2x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bee8bd501f0faa9d4ea0f8b81f92077874e7ca7 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml @@ -0,0 +1,42 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare + RESOLUTION: 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default: GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 6000 + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_2x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dacafed2e53e429ec720b91ef8b081e672e6bcf --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-C4_2x.yaml @@ -0,0 +1,42 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare + RESOLUTION: 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default: GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 6000 + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9798483ef7a125e9aa8fb572363c3f252dcd8a3c --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_2x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0568c306f9891b2ca945781b9a0f689fd1657395 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_2x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4276e9e8c78decd5b2bb4c8c7a27314c090db530 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_2x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a55962c0bc8a8fdfe54cf0b60e5d4e096ad1ed6f --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_2x.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcfcfdd72785906c72bb5331012281416bfc9354 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_2x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..277ede98f442c07f72daeb0cb0438cd781365bce --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_X-101-64x4d-FPN_2x.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x.yaml b/configs/12_2017_baselines/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d540a299dacddf27d04feac6a4090435167ef1e4 --- /dev/null +++ b/configs/12_2017_baselines/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x.yaml @@ -0,0 +1,77 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet152_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1.44x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 260000 + STEPS: [0, 200000, 240000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (640, 672, 704, 736, 768, 800) # Scale jitter + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + BBOX_VOTE: + ENABLED: True + VOTE_TH: 0.9 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 + BBOX_AUG: + ENABLED: True + SCORE_HEUR: UNION + COORD_HEUR: UNION + H_FLIP: 
True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: False + ASPECT_RATIOS: () + ASPECT_RATIO_H_FLIP: False + MASK_AUG: + ENABLED: True + HEUR: SOFT_AVG + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: False + ASPECT_RATIOS: () + ASPECT_RATIO_H_FLIP: False +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/fast_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/fast_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c11631203adef3bd63005b42992b9e67767566cd --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,36 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: 
('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/fast_rcnn_R-101-FPN_2x.yaml b/configs/12_2017_baselines/fast_rcnn_R-101-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7c5b3ace1e6d194b3db318221bdd71cdfe7f3cc --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-101-FPN_2x.yaml @@ -0,0 +1,36 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_R-50-C4_1x.yaml b/configs/12_2017_baselines/fast_rcnn_R-50-C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2c3287917105b0c5f3d5dec5f63fa5240648108 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-50-C4_1x.yaml @@ -0,0 +1,34 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_train/rpn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_valminusminival/rpn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_minival/rpn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_R-50-C4_2x.yaml b/configs/12_2017_baselines/fast_rcnn_R-50-C4_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71313fa380e2c0f5ca651806f5ad9ffaca58d145 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-50-C4_2x.yaml @@ -0,0 +1,34 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_train/rpn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_valminusminival/rpn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_minival/rpn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..baa053cc140e3478a9b1e06108669c8abe11161c --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,36 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_R-50-FPN_2x.yaml b/configs/12_2017_baselines/fast_rcnn_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee5481c6dd38daeb3d2089f822a183e4a04c3e5 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_R-50-FPN_2x.yaml @@ -0,0 +1,36 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b65d35f69d021333a9b82d02d36fae2147896f00 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,43 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_2x.yaml b/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a129b5715caf6e7c45d15d9cb5736a3afe24ab8 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_X-101-32x8d-FPN_2x.yaml @@ -0,0 +1,43 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e28806d3d8697fc84e28fc08135d57590379f397 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,43 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_2x.yaml b/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af79f2bb0a0c5b9e67e2ba122b11b9cf3abd6ec1 --- /dev/null +++ b/configs/12_2017_baselines/fast_rcnn_X-101-64x4d-FPN_2x.yaml @@ -0,0 +1,43 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f37de735c1386e91d92e0f9928be5211c40f783a --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,50 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: 
('https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_s1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a13e45a2c1b54d9f6cf6f3789d9110fadad02172 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_R-101-FPN_s1x.yaml @@ -0,0 +1,50 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 
'https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999521/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml.08_20_33.1OkqMmqP/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe3d222d0a6eac530ecaae614d0d0274a77bb758 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,50 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 
'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_s1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..542d082d70138b963569089a991aed21870304f7 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_R-50-FPN_s1x.yaml @@ -0,0 +1,50 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + 
ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd4ca5d1fc36da65359033785818f410948d4b7f --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,55 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 
'https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7841f0b21e8d3e8e90b02272436b99cf29879332 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_X-101-32x8d-FPN_s1x.yaml @@ -0,0 +1,55 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + 
ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760438/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml.06_04_23.M2oJlDPW/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..137248a384562c2c0edcbe835ed8f854bd884bd9 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,56 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 
'https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml b/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d39633b8a9f0d4bfeba8910e5efe231cb923584 --- /dev/null +++ b/configs/12_2017_baselines/keypoint_rcnn_X-101-64x4d-FPN_s1x.yaml @@ -0,0 +1,56 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 130000 + STEPS: [0, 100000, 120000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: head_builder.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + 
ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35999553/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml.08_21_33.ghFzzArr/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/mask_rcnn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/mask_rcnn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b69697e840a044a61e344fe8bdb4a467ba05fbe --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-101-FPN_1x.yaml @@ -0,0 +1,45 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 
0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_R-101-FPN_2x.yaml b/configs/12_2017_baselines/mask_rcnn_R-101-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8561ffffcf8e7217d56cfd15c3f4a79b672390d0 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-101-FPN_2x.yaml @@ -0,0 +1,45 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998887/12_2017_baselines/rpn_R-101-FPN_1x.yaml.08_07_07.vzhHEs0V/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + 
MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_R-50-C4_1x.yaml b/configs/12_2017_baselines/mask_rcnn_R-50-C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b29b4440737561cb75f2769f160bd2d0120285ac --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-50-C4_1x.yaml @@ -0,0 +1,42 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare + RESOLUTION: 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default: GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_train/rpn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_valminusminival/rpn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_minival/rpn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/mask_rcnn_R-50-C4_2x.yaml b/configs/12_2017_baselines/mask_rcnn_R-50-C4_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06d07b671374ea2907efba890dfd3b052923e2cb --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-50-C4_2x.yaml @@ -0,0 +1,42 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +RPN: + SIZES: (32, 64, 128, 256, 512) +FAST_RCNN: + ROI_BOX_HEAD: ResNet.add_ResNet_roi_conv5_head + ROI_XFORM_METHOD: RoIAlign +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare + RESOLUTION: 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default: GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_train/rpn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_valminusminival/rpn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L/output/test/coco_2014_minival/rpn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/mask_rcnn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/mask_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14c7ae969583ebe52b5848b152ef958231c8e6dd --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,45 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 
+OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_R-50-FPN_2x.yaml b/configs/12_2017_baselines/mask_rcnn_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4778c037b0e560fa0e021304244416ffa4abdf44 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_R-50-FPN_2x.yaml @@ -0,0 +1,45 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 
+ NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d236267640fc661f40b791eeaf72b36056a3a82 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 
+TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_2x.yaml b/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a6532ec3afdcbdc99cbc087b32f365ea3e84d43 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_X-101-32x8d-FPN_2x.yaml @@ -0,0 +1,52 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: 
('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/36760102/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml.06_00_16.RWeBAniO/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2441d5ff9ec72c80932f5a536987da3454dca367 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + 
ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_2x.yaml b/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87e9e2dce80748014a2c68d0262022c3b7d96a56 --- /dev/null +++ b/configs/12_2017_baselines/mask_rcnn_X-101-64x4d-FPN_2x.yaml @@ -0,0 +1,53 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + # 2x schedule (note TRAIN.IMS_PER_BATCH: 1) + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 360000 + STEPS: [0, 240000, 320000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + # md5sum of weights pkl file: aa14062280226e48f569ef1c7212e7c7 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (800,) + MAX_SIZE: 1333 + 
IMS_PER_BATCH: 1 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998956/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml.08_08_41.Seh0psKz/output/test/coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/retinanet_R-101-FPN_1x.yaml b/configs/12_2017_baselines/retinanet_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c875328d92ea69d864494716950e18c27b0cfa71 --- /dev/null +++ b/configs/12_2017_baselines/retinanet_R-101-FPN_1x.yaml @@ -0,0 +1,41 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_R-101-FPN_2x.yaml b/configs/12_2017_baselines/retinanet_R-101-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a3f91aba4d3358b6e982cb8d5cbbff465376bce --- /dev/null +++ b/configs/12_2017_baselines/retinanet_R-101-FPN_2x.yaml @@ -0,0 +1,41 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_R-50-FPN_1x.yaml b/configs/12_2017_baselines/retinanet_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35271fa2ff92013f7cb507ee82c52cd6dfc854f3 --- /dev/null +++ b/configs/12_2017_baselines/retinanet_R-50-FPN_1x.yaml @@ -0,0 +1,41 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_R-50-FPN_2x.yaml b/configs/12_2017_baselines/retinanet_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21acf070f116c8752eb07744002c23e65ec01a8d --- /dev/null +++ b/configs/12_2017_baselines/retinanet_R-50-FPN_2x.yaml @@ -0,0 +1,41 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d06848160c866d12afc69b273abc68c42d97ecfa --- /dev/null +++ b/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_2x.yaml b/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da71fb6443e56f8f12a63e2f812cd2c63b24d7b8 --- /dev/null +++ b/configs/12_2017_baselines/retinanet_X-101-32x8d-FPN_2x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ac1175ea6d26b566bdcb16e89e4ead659079e6b --- /dev/null +++ b/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_2x.yaml b/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c2d47431b58227ce48345d9d6e5afb3fd519073 --- /dev/null +++ b/configs/12_2017_baselines/retinanet_X-101-64x4d-FPN_2x.yaml @@ -0,0 +1,46 @@ +MODEL: + TYPE: retinanet + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 7 + RPN_MIN_LEVEL: 3 + COARSEST_STRIDE: 128 + EXTRA_CONV_LEVELS: True +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +RETINANET: + RETINANET_ON: True + NUM_CONVS: 4 + ASPECT_RATIOS: (1.0, 2.0, 0.5) + SCALES_PER_OCTAVE: 3 + ANCHOR_SCALE: 4 + LOSS_GAMMA: 2.0 + LOSS_ALPHA: 0.25 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + RPN_STRADDLE_THRESH: -1 # default 0 +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 10000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_R-101-FPN_1x.yaml b/configs/12_2017_baselines/rpn_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d85c06b57ad938787b8512f01f2e22b89679a0ca --- /dev/null +++ b/configs/12_2017_baselines/rpn_R-101-FPN_1x.yaml @@ -0,0 +1,32 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('coco_2014_minival','coco_2014_train','coco_2014_valminusminival') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_R-50-C4_1x.yaml b/configs/12_2017_baselines/rpn_R-50-C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca1d4745cc21977dd60450bd3ee45ebe9a29ad1c --- /dev/null +++ b/configs/12_2017_baselines/rpn_R-50-C4_1x.yaml @@ -0,0 +1,26 @@ +MODEL: + TYPE: rpn + CONV_BODY: ResNet.add_ResNet50_conv4_body + NUM_CLASSES: 81 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +RPN: + SIZES: (32, 64, 128, 256, 512) +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('coco_2014_minival','coco_2014_train','coco_2014_valminusminival') + SCALE: 800 + MAX_SIZE: 1333 +USE_NCCL: False +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/rpn_R-50-FPN_1x.yaml b/configs/12_2017_baselines/rpn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90990fe33783cacece36246c1ea004849649ad5f --- /dev/null +++ b/configs/12_2017_baselines/rpn_R-50-FPN_1x.yaml @@ -0,0 +1,32 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('coco_2014_minival','coco_2014_train','coco_2014_valminusminival') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + 
RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . diff --git a/configs/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98b78461648e9b6d82ac41447b6b7c8de6c50229 --- /dev/null +++ b/configs/12_2017_baselines/rpn_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('coco_2014_minival','coco_2014_train','coco_2014_valminusminival') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf92ba6a90c61ae75b68b43714df847559ec5531 --- /dev/null +++ b/configs/12_2017_baselines/rpn_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 81 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('coco_2014_minival','coco_2014_train','coco_2014_valminusminival') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml b/configs/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1e8f4ba2f2d9b0bcf787627c66b994aa8746f90 --- /dev/null +++ b/configs/12_2017_baselines/rpn_person_only_R-101-FPN_1x.yaml @@ -0,0 +1,32 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-101.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('keypoints_coco_2014_minival', 'keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival', 'keypoints_coco_2015_test') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml b/configs/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..395a91de629b6a8ab1d6b1d6c4e5513a4e1ca89b --- /dev/null +++ b/configs/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml @@ -0,0 +1,32 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('keypoints_coco_2014_minival', 'keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival', 'keypoints_coco_2015_test') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml b/configs/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a797dda128516708e7f5ac7c96650d08c9ea1e24 --- /dev/null +++ b/configs/12_2017_baselines/rpn_person_only_X-101-32x8d-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/20171220/X-101-32x8d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('keypoints_coco_2014_minival', 'keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival', 'keypoints_coco_2015_test') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml b/configs/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f7b3082c66ce5fb3ebba84315598575a4495249 --- /dev/null +++ b/configs/12_2017_baselines/rpn_person_only_X-101-64x4d-FPN_1x.yaml @@ -0,0 +1,37 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet101_conv5_body + NUM_CLASSES: 2 + RPN_ONLY: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_RPN: True + RPN_MAX_LEVEL: 6 + RPN_MIN_LEVEL: 2 + RPN_ANCHOR_START_SIZE: 32 + RPN_ASPECT_RATIOS: (0.5, 1, 2) +RESNETS: + STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models + TRANS_FUNC: bottleneck_transformation + NUM_GROUPS: 64 + WIDTH_PER_GROUP: 4 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 +TEST: + DATASETS: ('keypoints_coco_2014_minival', 'keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival', 'keypoints_coco_2015_test') + SCALE: 800 + MAX_SIZE: 1333 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 2000 +OUTPUT_DIR: . 
diff --git a/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml b/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83ea2acdb354d91edb27d6dbc6c1a3b6f19383e1 --- /dev/null +++ b/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 1 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.0025 + GAMMA: 0.1 + MAX_ITER: 60000 + STEPS: [0, 30000, 40000] + # Equivalent schedules with... + # 1 GPU: + # BASE_LR: 0.0025 + # MAX_ITER: 60000 + # STEPS: [0, 30000, 40000] + # 2 GPUs: + # BASE_LR: 0.005 + # MAX_ITER: 30000 + # STEPS: [0, 15000, 20000] + # 4 GPUs: + # BASE_LR: 0.01 + # MAX_ITER: 15000 + # STEPS: [0, 7500, 10000] + # 8 GPUs: + # BASE_LR: 0.02 + # MAX_ITER: 7500 + # STEPS: [0, 3750, 5000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train',) + SCALES: (500,) + MAX_SIZE: 833 + BATCH_SIZE_PER_IM: 256 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 500 + MAX_SIZE: 833 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/getting_started/tutorial_2gpu_e2e_faster_rcnn_R-50-FPN.yaml b/configs/getting_started/tutorial_2gpu_e2e_faster_rcnn_R-50-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8df147030efbe7c54cf6ea9aef48b99a9f10bca --- /dev/null +++ b/configs/getting_started/tutorial_2gpu_e2e_faster_rcnn_R-50-FPN.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 2 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.005 + GAMMA: 0.1 + MAX_ITER: 30000 + STEPS: [0, 15000, 20000] + # Equivalent schedules with... + # 1 GPU: + # BASE_LR: 0.0025 + # MAX_ITER: 60000 + # STEPS: [0, 30000, 40000] + # 2 GPUs: + # BASE_LR: 0.005 + # MAX_ITER: 30000 + # STEPS: [0, 15000, 20000] + # 4 GPUs: + # BASE_LR: 0.01 + # MAX_ITER: 15000 + # STEPS: [0, 7500, 10000] + # 8 GPUs: + # BASE_LR: 0.02 + # MAX_ITER: 7500 + # STEPS: [0, 3750, 5000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train',) + SCALES: (500,) + MAX_SIZE: 833 + BATCH_SIZE_PER_IM: 256 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 500 + MAX_SIZE: 833 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/getting_started/tutorial_4gpu_e2e_faster_rcnn_R-50-FPN.yaml b/configs/getting_started/tutorial_4gpu_e2e_faster_rcnn_R-50-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ffffc665e21f50dc154cada1640a89303278985 --- /dev/null +++ b/configs/getting_started/tutorial_4gpu_e2e_faster_rcnn_R-50-FPN.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 4 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.01 + GAMMA: 0.1 + MAX_ITER: 15000 + STEPS: [0, 7500, 10000] + # Equivalent schedules with... + # 1 GPU: + # BASE_LR: 0.0025 + # MAX_ITER: 60000 + # STEPS: [0, 30000, 40000] + # 2 GPUs: + # BASE_LR: 0.005 + # MAX_ITER: 30000 + # STEPS: [0, 15000, 20000] + # 4 GPUs: + # BASE_LR: 0.01 + # MAX_ITER: 15000 + # STEPS: [0, 7500, 10000] + # 8 GPUs: + # BASE_LR: 0.02 + # MAX_ITER: 7500 + # STEPS: [0, 3750, 5000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train',) + SCALES: (500,) + MAX_SIZE: 833 + BATCH_SIZE_PER_IM: 256 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 500 + MAX_SIZE: 833 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/getting_started/tutorial_8gpu_e2e_faster_rcnn_R-50-FPN.yaml b/configs/getting_started/tutorial_8gpu_e2e_faster_rcnn_R-50-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21ce1fe6defc7e0a9fc5655ef589b88f70867559 --- /dev/null +++ b/configs/getting_started/tutorial_8gpu_e2e_faster_rcnn_R-50-FPN.yaml @@ -0,0 +1,54 @@ +MODEL: + TYPE: generalized_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 7500 + STEPS: [0, 3750, 5000] + # Equivalent schedules with... + # 1 GPU: + # BASE_LR: 0.0025 + # MAX_ITER: 60000 + # STEPS: [0, 30000, 40000] + # 2 GPUs: + # BASE_LR: 0.005 + # MAX_ITER: 30000 + # STEPS: [0, 15000, 20000] + # 4 GPUs: + # BASE_LR: 0.01 + # MAX_ITER: 15000 + # STEPS: [0, 7500, 10000] + # 8 GPUs: + # BASE_LR: 0.02 + # MAX_ITER: 7500 + # STEPS: [0, 3750, 5000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train',) + SCALES: (500,) + MAX_SIZE: 833 + BATCH_SIZE_PER_IM: 256 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 500 + MAX_SIZE: 833 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 +OUTPUT_DIR: . 
diff --git a/configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_2x.yaml b/configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_2x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2bfd3b583cb09a40a8767b6142dc6f0e46fb90f --- /dev/null +++ b/configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_2x.yaml @@ -0,0 +1,79 @@ +MODEL: + TYPE: mask_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 81 + FASTER_RCNN: True + MASK_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 180000 + STEPS: [0, 120000, 160000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +MRCNN: + ROI_MASK_HEAD: mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs + RESOLUTION: 28 # (output mask resolution) default 14 + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 # default 7 + ROI_XFORM_SAMPLING_RATIO: 2 # default 0 + DILATION: 1 # default 2 + CONV_INIT: MSRAFill # default GaussianFill +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('coco_2014_train', 'coco_2014_valminusminival') + SCALES: (800,) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 + RPN_PRE_NMS_TOP_N: 2000 # Per FPN level +TEST: + DATASETS: ('coco_2014_minival',) + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + RPN_PRE_NMS_TOP_N: 1000 # Per FPN level + RPN_POST_NMS_TOP_N: 1000 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/35859007/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_2x.yaml.01_49_07.By8nQcCH/output/train/coco_2014_train:coco_2014_valminusminival/generalized_rcnn/model_final.pkl + + # -- Test time augmentation example -- # + BBOX_AUG: + ENABLED: True + SCORE_HEUR: UNION # AVG NOTE: cannot use AVG for e2e model + COORD_HEUR: UNION # AVG NOTE: cannot use AVG for e2e model + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 
1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: False + AREA_TH_LO: 2500 # 50^2 + AREA_TH_HI: 32400 # 180^2 + ASPECT_RATIOS: () + ASPECT_RATIO_H_FLIP: False + MASK_AUG: + ENABLED: True + HEUR: SOFT_AVG + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: False + AREA_TH: 32400 # 180^2 + ASPECT_RATIOS: () + ASPECT_RATIO_H_FLIP: False + BBOX_VOTE: + ENABLED: True + VOTE_TH: 0.9 + # -- Test time augmentation example -- # + +USE_NCCL: False +OUTPUT_DIR: . diff --git a/configs/test_time_aug/keypoint_rcnn_R-50-FPN_1x.yaml b/configs/test_time_aug/keypoint_rcnn_R-50-FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e0d17e927e2f963458750fb27e34b80dbf2ac4a --- /dev/null +++ b/configs/test_time_aug/keypoint_rcnn_R-50-FPN_1x.yaml @@ -0,0 +1,77 @@ +MODEL: + TYPE: keypoint_rcnn + CONV_BODY: FPN.add_fpn_ResNet50_conv5_body + NUM_CLASSES: 2 + KEYPOINTS_ON: True +NUM_GPUS: 8 +SOLVER: + WEIGHT_DECAY: 0.0001 + LR_POLICY: steps_with_decay + BASE_LR: 0.02 + GAMMA: 0.1 + MAX_ITER: 90000 + STEPS: [0, 60000, 80000] +FPN: + FPN_ON: True + MULTILEVEL_ROIS: True + MULTILEVEL_RPN: True # accidentally True; disable in the future +FAST_RCNN: + ROI_BOX_HEAD: fast_rcnn_heads.add_roi_2mlp_head + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 7 + ROI_XFORM_SAMPLING_RATIO: 2 +KRCNN: + ROI_KEYPOINTS_HEAD: keypoint_rcnn_heads.add_roi_pose_head_v1convX + NUM_STACKED_CONVS: 8 + NUM_KEYPOINTS: 17 + USE_DECONV_OUTPUT: True + CONV_INIT: MSRAFill + CONV_HEAD_DIM: 512 + UP_SCALE: 2 + HEATMAP_SIZE: 56 # ROI_XFORM_RESOLUTION (14) * UP_SCALE (2) * USE_DECONV_OUTPUT (2) + ROI_XFORM_METHOD: RoIAlign + ROI_XFORM_RESOLUTION: 14 + ROI_XFORM_SAMPLING_RATIO: 2 + KEYPOINT_CONFIDENCE: bbox +TRAIN: + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl + DATASETS: ('keypoints_coco_2014_train', 'keypoints_coco_2014_valminusminival') + PROPOSAL_FILES: 
('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_train/generalized_rcnn/rpn_proposals.pkl', 'https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_valminusminival/generalized_rcnn/rpn_proposals.pkl') + SCALES: (640, 672, 704, 736, 768, 800) + MAX_SIZE: 1333 + BATCH_SIZE_PER_IM: 512 +TEST: + DATASETS: ('keypoints_coco_2014_minival',) + PROPOSAL_FILES: ('https://dl.fbaipublicfiles.com/detectron/35998996/12_2017_baselines/rpn_person_only_R-50-FPN_1x.yaml.08_10_08.0ZWmJm6F/output/test/keypoints_coco_2014_minival/generalized_rcnn/rpn_proposals.pkl',) + PROPOSAL_LIMIT: 1000 + SCALE: 800 + MAX_SIZE: 1333 + NMS: 0.5 + WEIGHTS: https://dl.fbaipublicfiles.com/detectron/37651887/12_2017_baselines/keypoint_rcnn_R-50-FPN_s1x.yaml.20_01_40.FDjUQ7VX/output/train/keypoints_coco_2014_train:keypoints_coco_2014_valminusminival/generalized_rcnn/model_final.pkl + + # -- Test time augmentation example -- # + BBOX_AUG: + ENABLED: True + SCORE_HEUR: AVG + COORD_HEUR: AVG + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: False + AREA_TH_LO: 2500 # 50^2 + AREA_TH_HI: 32400 # 180^2 + KPS_AUG: + ENABLED: True + HEUR: HM_AVG + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True + SCALE_SIZE_DEP: True + AREA_TH: 22500 # 150^2 + ASPECT_RATIOS: () + ASPECT_RATIO_H_FLIP: False + # -- Test time augmentation example -- # + +OUTPUT_DIR: . 
diff --git a/configs/tools/convert_cityscapes_to_coco.py b/configs/tools/convert_cityscapes_to_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3583eca1aef66f37dbc9621ff006cde5885233f2 --- /dev/null +++ b/configs/tools/convert_cityscapes_to_coco.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import h5py +import json +import os +import imageio +import sys + +import cityscapesscripts.evaluation.instances2dict_with_polygons as cs + +import detectron.utils.segms as segms_util +import detectron.utils.boxes as bboxs_util + + +def parse_args(): + parser = argparse.ArgumentParser(description='Convert dataset') + parser.add_argument( + '--dataset', help="cocostuff, cityscapes", default=None, type=str) + parser.add_argument( + '--outdir', help="output dir for json files", default=None, type=str) + parser.add_argument( + '--datadir', help="data dir for annotations to be converted", + default=None, type=str) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +def convert_coco_stuff_mat(data_dir, out_dir): + """Convert to png and save json with path. 
This currently only contains + the segmentation labels for objects+stuff in cocostuff - if we need to + combine with other labels from original COCO that will be a TODO.""" + sets = ['train', 'val'] + categories = [] + json_name = 'coco_stuff_%s.json' + ann_dict = {} + for data_set in sets: + file_list = os.path.join(data_dir, '%s.txt') + images = [] + with open(file_list % data_set) as f: + for img_id, img_name in enumerate(f): + img_name = img_name.replace('coco', 'COCO').strip('\n') + image = {} + mat_file = os.path.join( + data_dir, 'annotations/%s.mat' % img_name) + data = h5py.File(mat_file, 'r') + labelMap = data.get('S') + if len(categories) == 0: + labelNames = data.get('names') + for idx, n in enumerate(labelNames): + categories.append( + {"id": idx, "name": ''.join(chr(i) for i in data[ + n[0]])}) + ann_dict['categories'] = categories + imageio.imsave( + os.path.join(data_dir, img_name + '.png'), labelMap) + image['width'] = labelMap.shape[0] + image['height'] = labelMap.shape[1] + image['file_name'] = img_name + image['seg_file_name'] = img_name + image['id'] = img_id + images.append(image) + ann_dict['images'] = images + print("Num images: %s" % len(images)) + with open(os.path.join(out_dir, json_name % data_set), 'wb') as outfile: + outfile.write(json.dumps(ann_dict)) + + +# for Cityscapes +def getLabelID(self, instID): + if (instID < 1000): + return instID + else: + return int(instID / 1000) + + +def convert_cityscapes_instance_only( + data_dir, out_dir): + """Convert from cityscapes format to COCO instance seg format - polygons""" + sets = [ + 'gtFine_val', + # 'gtFine_train', + # 'gtFine_test', + + # 'gtCoarse_train', + # 'gtCoarse_val', + # 'gtCoarse_train_extra' + ] + ann_dirs = [ + 'gtFine_trainvaltest/gtFine/val', + # 'gtFine_trainvaltest/gtFine/train', + # 'gtFine_trainvaltest/gtFine/test', + + # 'gtCoarse/train', + # 'gtCoarse/train_extra', + # 'gtCoarse/val' + ] + json_name = 'instancesonly_filtered_%s.json' + ends_in = '%s_polygons.json' + 
img_id = 0 + ann_id = 0 + cat_id = 1 + category_dict = {} + + category_instancesonly = [ + 'person', + 'rider', + 'car', + 'truck', + 'bus', + 'train', + 'motorcycle', + 'bicycle', + ] + + for data_set, ann_dir in zip(sets, ann_dirs): + print('Starting %s' % data_set) + ann_dict = {} + images = [] + annotations = [] + ann_dir = os.path.join(data_dir, ann_dir) + for root, _, files in os.walk(ann_dir): + for filename in files: + if filename.endswith(ends_in % data_set.split('_')[0]): + if len(images) % 50 == 0: + print("Processed %s images, %s annotations" % ( + len(images), len(annotations))) + json_ann = json.load(open(os.path.join(root, filename))) + image = {} + image['id'] = img_id + img_id += 1 + + image['width'] = json_ann['imgWidth'] + image['height'] = json_ann['imgHeight'] + image['file_name'] = filename[:-len( + ends_in % data_set.split('_')[0])] + 'leftImg8bit.png' + image['seg_file_name'] = filename[:-len( + ends_in % data_set.split('_')[0])] + \ + '%s_instanceIds.png' % data_set.split('_')[0] + images.append(image) + + fullname = os.path.join(root, image['seg_file_name']) + objects = cs.instances2dict_with_polygons( + [fullname], verbose=False)[fullname] + + for object_cls in objects: + if object_cls not in category_instancesonly: + continue # skip non-instance categories + + for obj in objects[object_cls]: + if obj['contours'] == []: + print('Warning: empty contours.') + continue # skip non-instance categories + + len_p = [len(p) for p in obj['contours']] + if min(len_p) <= 4: + print('Warning: invalid contours.') + continue # skip non-instance categories + + ann = {} + ann['id'] = ann_id + ann_id += 1 + ann['image_id'] = image['id'] + ann['segmentation'] = obj['contours'] + + if object_cls not in category_dict: + category_dict[object_cls] = cat_id + cat_id += 1 + ann['category_id'] = category_dict[object_cls] + ann['iscrowd'] = 0 + ann['area'] = obj['pixelCount'] + ann['bbox'] = bboxs_util.xyxy_to_xywh( + segms_util.polys_to_boxes( + 
[ann['segmentation']])).tolist()[0] + + annotations.append(ann) + + ann_dict['images'] = images + categories = [{"id": category_dict[name], "name": name} for name in + category_dict] + ann_dict['categories'] = categories + ann_dict['annotations'] = annotations + print("Num categories: %s" % len(categories)) + print("Num images: %s" % len(images)) + print("Num annotations: %s" % len(annotations)) + with open(os.path.join(out_dir, json_name % data_set), 'wb') as outfile: + outfile.write(json.dumps(ann_dict)) + + +if __name__ == '__main__': + args = parse_args() + if args.dataset == "cityscapes_instance_only": + convert_cityscapes_instance_only(args.datadir, args.outdir) + elif args.dataset == "cocostuff": + convert_coco_stuff_mat(args.datadir, args.outdir) + else: + print("Dataset not supported: %s" % args.dataset) diff --git a/configs/tools/convert_coco_model_to_cityscapes.py b/configs/tools/convert_coco_model_to_cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..11dec595af064db202b366351cabc7cffd16125c --- /dev/null +++ b/configs/tools/convert_coco_model_to_cityscapes.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +# Convert a detection model trained for COCO into a model that can be fine-tuned +# on cityscapes +# +# cityscapes_to_coco + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import numpy as np +import os +import sys + +import detectron.datasets.coco_to_cityscapes_id as cs +from detectron.utils.io import load_object +from detectron.utils.io import save_object + +NUM_CS_CLS = 9 +NUM_COCO_CLS = 81 + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert a COCO pre-trained model for use with Cityscapes') + parser.add_argument( + '--coco_model', dest='coco_model_file_name', + help='Pretrained network weights file path', + default=None, type=str) + parser.add_argument( + '--convert_func', dest='convert_func', + help='Blob conversion function', + default='cityscapes_to_coco', type=str) + parser.add_argument( + '--output', dest='out_file_name', + help='Output file path', + default=None, type=str) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + + +def convert_coco_blobs_to_cityscape_blobs(model_dict): + for k, v in model_dict['blobs'].items(): + if v.shape[0] == NUM_COCO_CLS or v.shape[0] == 4 * NUM_COCO_CLS: + coco_blob = model_dict['blobs'][k] + print( + 'Converting COCO blob {} with shape {}'. + format(k, coco_blob.shape) + ) + cs_blob = convert_coco_blob_to_cityscapes_blob( + coco_blob, args.convert_func + ) + print(' -> converted shape {}'.format(cs_blob.shape)) + model_dict['blobs'][k] = cs_blob + + +def convert_coco_blob_to_cityscapes_blob(coco_blob, convert_func): + # coco blob (81, ...) or (81*4, ...) 
+ coco_shape = coco_blob.shape + leading_factor = int(coco_shape[0] / NUM_COCO_CLS) + tail_shape = list(coco_shape[1:]) + assert leading_factor == 1 or leading_factor == 4 + + # Reshape in [num_classes, ...] form for easier manipulations + coco_blob = coco_blob.reshape([NUM_COCO_CLS, -1] + tail_shape) + # Default initialization uses Gaussian with mean and std to match the + # existing parameters + std = coco_blob.std() + mean = coco_blob.mean() + cs_shape = [NUM_CS_CLS] + list(coco_blob.shape[1:]) + cs_blob = (np.random.randn(*cs_shape) * std + mean).astype(np.float32) + + # Replace random parameters with COCO parameters if class mapping exists + for i in range(NUM_CS_CLS): + coco_cls_id = getattr(cs, convert_func)(i) + if coco_cls_id >= 0: # otherwise ignore (rand init) + cs_blob[i] = coco_blob[coco_cls_id] + + cs_shape = [NUM_CS_CLS * leading_factor] + tail_shape + return cs_blob.reshape(cs_shape) + + +def remove_momentum(model_dict): + for k in model_dict['blobs'].keys(): + if k.endswith('_momentum'): + del model_dict['blobs'][k] + + +def load_and_convert_coco_model(args): + model_dict = load_object(args.coco_model_file_name) + remove_momentum(model_dict) + convert_coco_blobs_to_cityscape_blobs(model_dict) + return model_dict + + +if __name__ == '__main__': + args = parse_args() + print(args) + assert os.path.exists(args.coco_model_file_name), \ + 'Weights file does not exist' + weights = load_and_convert_coco_model(args) + + save_object(weights, args.out_file_name) + print('Wrote blobs to {}:'.format(args.out_file_name)) + print(sorted(weights['blobs'].keys())) diff --git a/configs/tools/convert_pkl_to_pb.py b/configs/tools/convert_pkl_to_pb.py new file mode 100644 index 0000000000000000000000000000000000000000..522d83896ed5c075cd4ac1c11c83abe3719d4325 --- /dev/null +++ b/configs/tools/convert_pkl_to_pb.py @@ -0,0 +1,696 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Script to convert the model (.yaml and .pkl) trained by train_net to a +standard Caffe2 model in pb format (model.pb and model_init.pb). The converted +model is good for production usage, as it could run independently and efficiently +on CPU, GPU and mobile without depending on the detectron codebase. + +Please see Caffe2 tutorial ( +https://caffe2.ai/docs/tutorial-loading-pre-trained-models.html) for loading +the converted model, and run_model_pb() for running the model for inference. 
+""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse +import copy +import os +import pprint +import sys + +import caffe2.python.utils as putils +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import detectron.core.test_engine as test_engine +import detectron.utils.blob as blob_utils +import detectron.utils.c2 as c2_utils +import detectron.utils.model_convert_utils as mutils +import detectron.utils.vis as vis_utils +import numpy as np +from caffe2.caffe2.fb.predictor import predictor_exporter, predictor_py_utils +from caffe2.proto import caffe2_pb2 +from caffe2.python import core, workspace +from caffe2.python.predictor_constants import predictor_constants +from detectron.core.config import ( + assert_and_infer_cfg, + cfg, + merge_cfg_from_file, + merge_cfg_from_list, +) +from detectron.modeling import generate_anchors +from detectron.utils.logging import setup_logging +from detectron.utils.model_convert_utils import convert_op_in_proto, op_filter + + +c2_utils.import_contrib_ops() +c2_utils.import_detectron_ops() + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. 
+cv2.ocl.setUseOpenCL(False) + +logger = setup_logging(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert a trained network to pb format" + ) + parser.add_argument( + "--cfg", dest="cfg_file", help="optional config file", default=None, type=str + ) + parser.add_argument( + "--net_name", + dest="net_name", + help="optional name for the net", + default="detectron", + type=str, + ) + parser.add_argument( + "--out_dir", dest="out_dir", help="output dir", default=None, type=str + ) + parser.add_argument( + "--test_img", + dest="test_img", + help="optional test image, used to verify the model conversion", + default=None, + type=str, + ) + parser.add_argument( + "--fuse_af", dest="fuse_af", help="1 to fuse_af", default=1, type=int + ) + parser.add_argument( + "--device", + dest="device", + help="Device to run the model on", + choices=["cpu", "gpu"], + default="cpu", + type=str, + ) + parser.add_argument( + "--net_execution_type", + dest="net_execution_type", + help="caffe2 net execution type", + choices=["simple", "dag"], + default="simple", + type=str, + ) + parser.add_argument( + "--use_nnpack", + dest="use_nnpack", + help="Use nnpack for conv", + default=1, + type=int, + ) + parser.add_argument( + "--logdb", + dest="logdb", + help="output to logfiledb instead of pb files", + default=0, + type=int, + ) + parser.add_argument( + "opts", + help="See detectron/core/config.py for all options", + default=None, + nargs=argparse.REMAINDER, + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + ret = parser.parse_args() + ret.out_dir = os.path.abspath(ret.out_dir) + if ret.device == "gpu" and ret.use_nnpack: + logger.warn("Should not use mobile engine for gpu model.") + ret.use_nnpack = 0 + + return ret + + +def unscope_name(name): + return c2_utils.UnscopeName(name) + + +def reset_names(names): + for i in range(len(names)): + names[i] = unscope_name(names[i]) + + +def convert_collect_and_distribute( + op, + blobs, + 
roi_canonical_scale, + roi_canonical_level, + roi_max_level, + roi_min_level, + rpn_max_level, + rpn_min_level, + rpn_post_nms_topN, +): + print( + "Converting CollectAndDistributeFpnRpnProposals" + " Python -> C++:\n{}".format(op) + ) + assert op.name.startswith( + "CollectAndDistributeFpnRpnProposalsOp" + ), "Not valid CollectAndDistributeFpnRpnProposalsOp" + + inputs = [x for x in op.input] + ret = core.CreateOperator( + "CollectAndDistributeFpnRpnProposals", + inputs, + list(op.output), + roi_canonical_scale=roi_canonical_scale, + roi_canonical_level=roi_canonical_level, + roi_max_level=roi_max_level, + roi_min_level=roi_min_level, + rpn_max_level=rpn_max_level, + rpn_min_level=rpn_min_level, + rpn_post_nms_topN=rpn_post_nms_topN, + ) + return ret + + +def convert_gen_proposals( + op, blobs, rpn_pre_nms_topN, rpn_post_nms_topN, rpn_nms_thresh, rpn_min_size +): + print("Converting GenerateProposals Python -> C++:\n{}".format(op)) + assert op.name.startswith("GenerateProposalsOp"), "Not valid GenerateProposalsOp" + + spatial_scale = mutils.get_op_arg_valf(op, "spatial_scale", None) + assert spatial_scale is not None + + lvl = int(op.input[0][-1]) if op.input[0][-1].isdigit() else None + + inputs = [x for x in op.input] + anchor_name = "anchor{}".format(lvl) if lvl else "anchor" + inputs.append(anchor_name) + anchor_sizes = ( + (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.0 ** (lvl - cfg.FPN.RPN_MIN_LEVEL),) + if lvl + else cfg.RPN.SIZES + ) + blobs[anchor_name] = get_anchors(spatial_scale, anchor_sizes) + print("anchors {}".format(blobs[anchor_name])) + + ret = core.CreateOperator( + "GenerateProposals", + inputs, + list(op.output), + spatial_scale=spatial_scale, + pre_nms_topN=rpn_pre_nms_topN, + post_nms_topN=rpn_post_nms_topN, + nms_thresh=rpn_nms_thresh, + min_size=rpn_min_size, + correct_transform_coords=True, + ) + return ret, anchor_name + + +def get_anchors(spatial_scale, anchor_sizes): + anchors = generate_anchors.generate_anchors( + stride=1.0 / spatial_scale, + 
sizes=anchor_sizes, + aspect_ratios=cfg.RPN.ASPECT_RATIOS, + ).astype(np.float32) + return anchors + + +def reset_blob_names(blobs): + ret = {unscope_name(x): blobs[x] for x in blobs} + blobs.clear() + blobs.update(ret) + + +def convert_net(args, net, blobs): + @op_filter() + def convert_op_name(op): + if args.device != "gpu": + if op.engine != "DEPTHWISE_3x3": + op.engine = "" + op.device_option.CopyFrom(caffe2_pb2.DeviceOption()) + reset_names(op.input) + reset_names(op.output) + return [op] + + @op_filter(type="Python") + def convert_python(op): + if op.name.startswith("GenerateProposalsOp"): + gen_proposals_op, ext_input = convert_gen_proposals( + op, + blobs, + rpn_min_size=float(cfg.TEST.RPN_MIN_SIZE), + rpn_post_nms_topN=cfg.TEST.RPN_POST_NMS_TOP_N, + rpn_pre_nms_topN=cfg.TEST.RPN_PRE_NMS_TOP_N, + rpn_nms_thresh=cfg.TEST.RPN_NMS_THRESH, + ) + net.external_input.extend([ext_input]) + return [gen_proposals_op] + elif op.name.startswith("CollectAndDistributeFpnRpnProposalsOp"): + collect_dist_op = convert_collect_and_distribute( + op, + blobs, + roi_canonical_scale=cfg.FPN.ROI_CANONICAL_SCALE, + roi_canonical_level=cfg.FPN.ROI_CANONICAL_LEVEL, + roi_max_level=cfg.FPN.ROI_MAX_LEVEL, + roi_min_level=cfg.FPN.ROI_MIN_LEVEL, + rpn_max_level=cfg.FPN.RPN_MAX_LEVEL, + rpn_min_level=cfg.FPN.RPN_MIN_LEVEL, + rpn_post_nms_topN=cfg.TEST.RPN_POST_NMS_TOP_N, + ) + return [collect_dist_op] + else: + raise ValueError("Failed to convert Python op {}".format(op.name)) + + # Only convert UpsampleNearest to ResizeNearest when converting to pb so that the existing models is unchanged + # https://github.com/facebookresearch/Detectron/pull/372#issuecomment-410248561 + @op_filter(type="UpsampleNearest") + def convert_upsample_nearest(op): + for arg in op.arg: + if arg.name == "scale": + scale = arg.i + break + else: + raise KeyError('No attribute "scale" in UpsampleNearest op') + resize_nearest_op = core.CreateOperator( + "ResizeNearest", + list(op.input), + list(op.output), + 
name=op.name, + width_scale=float(scale), + height_scale=float(scale), + ) + return resize_nearest_op + + @op_filter() + def convert_rpn_rois(op): + for j in range(len(op.input)): + if op.input[j] == "rois": + print( + "Converting op {} input name: rois -> rpn_rois:\n{}".format( + op.type, op + ) + ) + op.input[j] = "rpn_rois" + for j in range(len(op.output)): + if op.output[j] == "rois": + print( + "Converting op {} output name: rois -> rpn_rois:\n{}".format( + op.type, op + ) + ) + op.output[j] = "rpn_rois" + return [op] + + @op_filter(type_in=["StopGradient", "Alias"]) + def convert_remove_op(op): + print("Removing op {}:\n{}".format(op.type, op)) + return [] + + # We want to apply to all operators, including converted + # so run separately + convert_op_in_proto(net, convert_remove_op) + convert_op_in_proto(net, convert_upsample_nearest) + convert_op_in_proto(net, convert_python) + convert_op_in_proto(net, convert_op_name) + convert_op_in_proto(net, convert_rpn_rois) + + reset_names(net.external_input) + reset_names(net.external_output) + + reset_blob_names(blobs) + + +def add_bbox_ops(args, net, blobs): + new_ops = [] + new_external_outputs = [] + + # Operators for bboxes + op_box = core.CreateOperator( + "BBoxTransform", + ["rpn_rois", "bbox_pred", "im_info"], + ["pred_bbox"], + weights=cfg.MODEL.BBOX_REG_WEIGHTS, + apply_scale=False, + correct_transform_coords=True, + ) + new_ops.extend([op_box]) + + blob_prob = "cls_prob" + blob_box = "pred_bbox" + op_nms = core.CreateOperator( + "BoxWithNMSLimit", + [blob_prob, blob_box], + ["score_nms", "bbox_nms", "class_nms"], + arg=[ + putils.MakeArgument("score_thresh", cfg.TEST.SCORE_THRESH), + putils.MakeArgument("nms", cfg.TEST.NMS), + putils.MakeArgument("detections_per_im", cfg.TEST.DETECTIONS_PER_IM), + putils.MakeArgument("soft_nms_enabled", cfg.TEST.SOFT_NMS.ENABLED), + putils.MakeArgument("soft_nms_method", cfg.TEST.SOFT_NMS.METHOD), + putils.MakeArgument("soft_nms_sigma", cfg.TEST.SOFT_NMS.SIGMA), + ], + ) + 
new_ops.extend([op_nms]) + new_external_outputs.extend(["score_nms", "bbox_nms", "class_nms"]) + + net.Proto().op.extend(new_ops) + net.Proto().external_output.extend(new_external_outputs) + + +def convert_model_gpu(args, net, init_net): + assert args.device == "gpu" + + ret_net = copy.deepcopy(net) + ret_init_net = copy.deepcopy(init_net) + + cdo_cuda = mutils.get_device_option_cuda() + cdo_cpu = mutils.get_device_option_cpu() + + CPU_OPS = [ + ["CollectAndDistributeFpnRpnProposals", None], + ["GenerateProposals", None], + ["BBoxTransform", None], + ["BoxWithNMSLimit", None], + ] + CPU_BLOBS = ["im_info", "anchor"] + + @op_filter() + def convert_op_gpu(op): + for x in CPU_OPS: + if mutils.filter_op(op, type=x[0], inputs=x[1]): + return None + op.device_option.CopyFrom(cdo_cuda) + return [op] + + @op_filter() + def convert_init_op_gpu(op): + if op.output[0] in CPU_BLOBS: + op.device_option.CopyFrom(cdo_cpu) + else: + op.device_option.CopyFrom(cdo_cuda) + return [op] + + convert_op_in_proto(ret_init_net.Proto(), convert_init_op_gpu) + convert_op_in_proto(ret_net.Proto(), convert_op_gpu) + + ret = core.InjectDeviceCopiesAmongNets([ret_init_net, ret_net]) + + return [ret[0][1], ret[0][0]] + + +def gen_init_net(net, blobs, empty_blobs): + blobs = copy.deepcopy(blobs) + for x in empty_blobs: + blobs[x] = np.array([], dtype=np.float32) + init_net = mutils.gen_init_net_from_blobs(blobs, net.external_inputs) + init_net = core.Net(init_net) + return init_net + + +def _save_image_graphs(args, all_net, all_init_net): + print("Saving model graph...") + mutils.save_graph( + all_net.Proto(), os.path.join(args.out_dir, "model_def.png"), op_only=False + ) + print("Model def image saved to {}.".format(args.out_dir)) + + +def _save_models(all_net, all_init_net, args): + print("Writing converted model to {}...".format(args.out_dir)) + fname = "model" + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + with open(os.path.join(args.out_dir, fname + ".pb"), "wb") as 
f: + f.write(all_net.Proto().SerializeToString()) + with open(os.path.join(args.out_dir, fname + ".pbtxt"), "wb") as f: + f.write(str(all_net.Proto())) + with open(os.path.join(args.out_dir, fname + "_init.pb"), "wb") as f: + f.write(all_init_net.Proto().SerializeToString()) + + _save_image_graphs(args, all_net, all_init_net) + + +def load_model(args): + model = test_engine.initialize_model_from_cfg(cfg.TEST.WEIGHTS) + blobs = mutils.get_ws_blobs() + + return model, blobs + + +def _get_result_blobs(check_blobs): + ret = {} + for x in check_blobs: + sn = core.ScopedName(x) + if workspace.HasBlob(sn): + ret[x] = workspace.FetchBlob(sn) + else: + ret[x] = None + + return ret + + +def _sort_results(boxes, segms, keypoints, classes): + indices = np.argsort(boxes[:, -1])[::-1] + if boxes is not None: + boxes = boxes[indices, :] + if segms is not None: + segms = [segms[x] for x in indices] + if keypoints is not None: + keypoints = [keypoints[x] for x in indices] + if classes is not None: + if isinstance(classes, list): + classes = [classes[x] for x in indices] + else: + classes = classes[indices] + + return boxes, segms, keypoints, classes + + +def run_model_cfg(args, im, check_blobs): + workspace.ResetWorkspace() + model, _ = load_model(args) + with c2_utils.NamedCudaScope(0): + cls_boxes, cls_segms, cls_keyps = test_engine.im_detect_all( + model, im, None, None + ) + + boxes, segms, keypoints, classes = vis_utils.convert_from_cls_format( + cls_boxes, cls_segms, cls_keyps + ) + + # sort the results based on score for comparision + boxes, segms, keypoints, classes = _sort_results(boxes, segms, keypoints, classes) + + # write final results back to workspace + def _ornone(res): + return np.array(res) if res is not None else np.array([], dtype=np.float32) + + with c2_utils.NamedCudaScope(0): + workspace.FeedBlob(core.ScopedName("result_boxes"), _ornone(boxes)) + workspace.FeedBlob(core.ScopedName("result_segms"), _ornone(segms)) + 
workspace.FeedBlob(core.ScopedName("result_keypoints"), _ornone(keypoints)) + workspace.FeedBlob(core.ScopedName("result_classids"), _ornone(classes)) + + # get result blobs + with c2_utils.NamedCudaScope(0): + ret = _get_result_blobs(check_blobs) + + return ret + + +def _prepare_blobs(im, pixel_means, target_size, max_size): + """ Reference: blob.prep_im_for_blob() """ + + im = im.astype(np.float32, copy=False) + im -= pixel_means + im_shape = im.shape + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + im = cv2.resize( + im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR + ) + + # Reuse code in blob_utils and fit FPN + blob = blob_utils.im_list_to_blob([im]) + + blobs = {} + blobs["data"] = blob + blobs["im_info"] = np.array( + [[blob.shape[2], blob.shape[3], im_scale]], dtype=np.float32 + ) + return blobs + + +def run_model_pb(args, net, init_net, im, check_blobs): + workspace.ResetWorkspace() + workspace.RunNetOnce(init_net) + mutils.create_input_blobs_for_net(net.Proto()) + workspace.CreateNet(net) + + # input_blobs, _ = core_test._get_blobs(im, None) + input_blobs = _prepare_blobs(im, cfg.PIXEL_MEANS, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE) + gpu_blobs = [] + if args.device == "gpu": + gpu_blobs = ["data"] + for k, v in input_blobs.items(): + workspace.FeedBlob( + core.ScopedName(k), + v, + mutils.get_device_option_cuda() + if k in gpu_blobs + else mutils.get_device_option_cpu(), + ) + + try: + workspace.RunNet(net) + scores = workspace.FetchBlob("score_nms") + classids = workspace.FetchBlob("class_nms") + boxes = workspace.FetchBlob("bbox_nms") + except Exception as e: + print("Running pb model failed.\n{}".format(e)) + # may not detect anything at all + R = 0 + scores = np.zeros((R,), dtype=np.float32) + boxes = np.zeros((R, 4), dtype=np.float32) + classids = 
np.zeros((R,), dtype=np.float32) + + boxes = np.column_stack((boxes, scores)) + + # sort the results based on score for comparision + boxes, _, _, classids = _sort_results(boxes, None, None, classids) + + # write final result back to workspace + workspace.FeedBlob("result_boxes", boxes) + workspace.FeedBlob("result_classids", classids) + + ret = _get_result_blobs(check_blobs) + + return ret + + +def verify_model(args, model_pb, test_img_file): + check_blobs = ["result_boxes", "result_classids"] # result + + print("Loading test file {}...".format(test_img_file)) + test_img = cv2.imread(test_img_file) + assert test_img is not None + + def _run_cfg_func(im, blobs): + return run_model_cfg(args, im, check_blobs) + + def _run_pb_func(im, blobs): + return run_model_pb(args, model_pb[0], model_pb[1], im, check_blobs) + + print("Checking models...") + assert mutils.compare_model(_run_cfg_func, _run_pb_func, test_img, check_blobs) + + +def _export_to_logfiledb(args, net, init_net, inputs, out_file, extra_out_tensors=None): + out_tensors = list(net.Proto().external_output) + if extra_out_tensors is not None: + out_tensors += extra_out_tensors + params = list(set(net.Proto().external_input) - set(inputs)) + net_type = None + predictor_export_meta = predictor_exporter.PredictorExportMeta( + predict_net=net, + parameters=params, + inputs=inputs, + outputs=out_tensors, + net_type=net_type, + ) + + logger.info("Exporting Caffe2 model to {}".format(out_file)) + predictor_exporter.save_to_db( + db_type="log_file_db", + db_destination=out_file, + predictor_export_meta=predictor_export_meta, + ) + + +def main(): + workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) + args = parse_args() + logger.info("Called with args:") + logger.info(args) + if args.cfg_file is not None: + merge_cfg_from_file(args.cfg_file) + if args.opts is not None: + merge_cfg_from_list(args.opts) + cfg.NUM_GPUS = 1 + assert_and_infer_cfg() + logger.info("Converting model with config:") + 
logger.info(pprint.pformat(cfg)) + + # script will stop when it can't find an operator rather + # than stopping based on these flags + # + # assert not cfg.MODEL.KEYPOINTS_ON, "Keypoint model not supported." + # assert not cfg.MODEL.MASK_ON, "Mask model not supported." + # assert not cfg.FPN.FPN_ON, "FPN not supported." + # assert not cfg.RETINANET.RETINANET_ON, "RetinaNet model not supported." + + # load model from cfg + model, blobs = load_model(args) + + net = core.Net("") + net.Proto().op.extend(copy.deepcopy(model.net.Proto().op)) + net.Proto().external_input.extend(copy.deepcopy(model.net.Proto().external_input)) + net.Proto().external_output.extend(copy.deepcopy(model.net.Proto().external_output)) + net.Proto().type = args.net_execution_type + net.Proto().num_workers = 1 if args.net_execution_type == "simple" else 4 + + # Reset the device_option, change to unscope name and replace python operators + convert_net(args, net.Proto(), blobs) + + # add operators for bbox + add_bbox_ops(args, net, blobs) + + if args.fuse_af: + print("Fusing affine channel...") + net, blobs = mutils.fuse_net_affine(net, blobs) + + if args.use_nnpack: + mutils.update_mobile_engines(net.Proto()) + + # generate init net + empty_blobs = ["data", "im_info"] + init_net = gen_init_net(net, blobs, empty_blobs) + + if args.device == "gpu": + [net, init_net] = convert_model_gpu(args, net, init_net) + + net.Proto().name = args.net_name + init_net.Proto().name = args.net_name + "_init" + + if args.test_img is not None: + verify_model(args, [net, init_net], args.test_img) + + if args.logdb == 1: + output_file = os.path.join(args.out_dir, "model.logfiledb") + _export_to_logfiledb(args, net, init_net, empty_blobs, output_file) + else: + _save_models(net, init_net, args) + +if __name__ == "__main__": + main() diff --git a/configs/tools/convert_selective_search.py b/configs/tools/convert_selective_search.py new file mode 100644 index 
0000000000000000000000000000000000000000..c98ae74a23f8204c0ae9ddb9ec4b1f9c79a9f3ec --- /dev/null +++ b/configs/tools/convert_selective_search.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Script to convert Selective Search proposal boxes into the Detectron proposal +file format. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import scipy.io as sio +import sys + +from detectron.datasets.json_dataset import JsonDataset +from detectron.utils.io import save_object + + +if __name__ == '__main__': + dataset_name = sys.argv[1] + file_in = sys.argv[2] + file_out = sys.argv[3] + + ds = JsonDataset(dataset_name) + roidb = ds.get_roidb() + raw_data = sio.loadmat(file_in)['boxes'].ravel() + assert raw_data.shape[0] == len(roidb) + + boxes = [] + scores = [] + ids = [] + for i in range(raw_data.shape[0]): + if i % 1000 == 0: + print('{}/{}'.format(i + 1, len(roidb))) + # selective search boxes are 1-indexed and (y1, x1, y2, x2) + i_boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 + boxes.append(i_boxes.astype(np.float32)) + scores.append(np.zeros((i_boxes.shape[0]), dtype=np.float32)) + ids.append(roidb[i]['id']) + + save_object(dict(boxes=boxes, scores=scores, indexes=ids), 
file_out) diff --git a/configs/tools/generate_testdev_from_test.py b/configs/tools/generate_testdev_from_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9d4b515cc0be732675ca1ca51d31649c93754bcb --- /dev/null +++ b/configs/tools/generate_testdev_from_test.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Given a full set of results (boxes, masks, or keypoints) on the 2017 COCO +test set, this script extracts the results subset that corresponds to 2017 +test-dev. The test-dev subset can then be submitted to the COCO evaluation +server. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import json +import os +import sys + +from detectron.datasets.dataset_catalog import get_ann_fn +from detectron.utils.timer import Timer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--json', dest='json_file', + help='detections json file', + default='', type=str) + parser.add_argument( + '--output-dir', dest='output_dir', + help='output directory', + default='/tmp', type=str) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + return args + + +def convert(json_file, output_dir): + print('Reading: {}'.format(json_file)) + with open(json_file, 'r') as fid: + dt = json.load(fid) + print('done!') + + test_image_info = get_ann_fn('coco_2017_test') + with open(test_image_info, 'r') as fid: + info_test = json.load(fid) + image_test = info_test['images'] + image_test_id = [i['id'] for i in image_test] + print('{} has {} images'.format(test_image_info, len(image_test_id))) + + test_dev_image_info = get_ann_fn('coco_2017_test-dev') + with open(test_dev_image_info, 'r') as fid: + info_testdev = json.load(fid) + image_testdev = info_testdev['images'] + image_testdev_id = [i['id'] for i in image_testdev] + print('{} has {} images'.format(test_dev_image_info, len(image_testdev_id))) + + dt_testdev = [] + print('Filtering test-dev from test...') + t = Timer() + t.tic() + for i in range(len(dt)): + if i % 1000 == 0: + print('{}/{}'.format(i, len(dt))) + if dt[i]['image_id'] in image_testdev_id: + dt_testdev.append(dt[i]) + print('Done filtering ({:2}s)!'.format(t.toc())) + + filename, file_extension = os.path.splitext(os.path.basename(json_file)) + filename = filename + '_test-dev' + filename = os.path.join(output_dir, filename + file_extension) + with open(filename, 'w') as fid: + info_test = json.dump(dt_testdev, fid) + 
print('Done writing: {}!'.format(filename)) + + +if __name__ == '__main__': + opts = parse_args() + convert(opts.json_file, opts.output_dir) diff --git a/configs/tools/infer.py b/configs/tools/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..1c0199624a0f7882ef89c28205ebe949d2b30cb5 --- /dev/null +++ b/configs/tools/infer.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Perform inference on a single image or all images with a certain extension +(e.g., .jpg) in a folder. Allows for using a combination of multiple models. +For example, one model may be used for RPN, another model for Fast R-CNN style +box detection, yet another model to predict masks, and yet another model to +predict keypoints. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import logging +import os +import sys + +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import load_cfg +from detectron.core.config import merge_cfg_from_cfg +from detectron.core.config import merge_cfg_from_file +from detectron.utils.io import cache_url +from detectron.utils.logging import setup_logging +import detectron.core.rpn_generator as rpn_engine +import detectron.core.test_engine as model_engine +import detectron.datasets.dummy_datasets as dummy_datasets +import detectron.utils.c2 as c2_utils +import detectron.utils.env as envu +import detectron.utils.vis as vis_utils + +c2_utils.import_detectron_ops() + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. +cv2.ocl.setUseOpenCL(False) + +# infer.py +# --im [path/to/image.jpg] \ +# --rpn-model [path/to/rpn/model.pkl] \ +# --rpn-cfg [path/to/rpn/config.yaml] \ +# --output-dir [path/to/output/dir] \ +# [model1] [config1] [model2] [config2] ... 
+ + +def parse_args(): + parser = argparse.ArgumentParser(description='Inference on an image') + parser.add_argument( + '--im', dest='im_file', help='input image', default=None, type=str + ) + parser.add_argument( + '--rpn-pkl', + dest='rpn_pkl', + help='rpn model file (pkl)', + default=None, + type=str + ) + parser.add_argument( + '--rpn-cfg', + dest='rpn_cfg', + help='cfg model file (yaml)', + default=None, + type=str + ) + parser.add_argument( + '--output-dir', + dest='output_dir', + help='directory for visualization pdfs (default: /tmp/infer)', + default='/tmp/infer', + type=str + ) + parser.add_argument( + 'models_to_run', + help='pairs of models & configs, listed like so: [pkl1] [yaml1] [pkl2] [yaml2] ...', + default=None, + nargs=argparse.REMAINDER + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +def get_rpn_box_proposals(im, args): + cfg.immutable(False) + merge_cfg_from_file(args.rpn_cfg) + cfg.NUM_GPUS = 1 + cfg.MODEL.RPN_ONLY = True + cfg.TEST.RPN_PRE_NMS_TOP_N = 10000 + cfg.TEST.RPN_POST_NMS_TOP_N = 2000 + assert_and_infer_cfg(cache_urls=False) + + model = model_engine.initialize_model_from_cfg(args.rpn_pkl) + with c2_utils.NamedCudaScope(0): + boxes, scores = rpn_engine.im_proposals(model, im) + return boxes, scores + + +def main(args): + logger = logging.getLogger(__name__) + dummy_coco_dataset = dummy_datasets.get_coco_dataset() + cfg_orig = load_cfg(envu.yaml_dump(cfg)) + im = cv2.imread(args.im_file) + + if args.rpn_pkl is not None: + proposal_boxes, _proposal_scores = get_rpn_box_proposals(im, args) + workspace.ResetWorkspace() + else: + proposal_boxes = None + + cls_boxes, cls_segms, cls_keyps = None, None, None + for i in range(0, len(args.models_to_run), 2): + pkl = args.models_to_run[i] + yml = args.models_to_run[i + 1] + cfg.immutable(False) + merge_cfg_from_cfg(cfg_orig) + merge_cfg_from_file(yml) + if len(pkl) > 0: + weights_file = pkl + else: + weights_file = cfg.TEST.WEIGHTS + cfg.NUM_GPUS 
= 1 + assert_and_infer_cfg(cache_urls=False) + model = model_engine.initialize_model_from_cfg(weights_file) + with c2_utils.NamedCudaScope(0): + cls_boxes_, cls_segms_, cls_keyps_ = \ + model_engine.im_detect_all(model, im, proposal_boxes) + cls_boxes = cls_boxes_ if cls_boxes_ is not None else cls_boxes + cls_segms = cls_segms_ if cls_segms_ is not None else cls_segms + cls_keyps = cls_keyps_ if cls_keyps_ is not None else cls_keyps + workspace.ResetWorkspace() + + out_name = os.path.join( + args.output_dir, '{}'.format(os.path.basename(args.im_file) + '.pdf') + ) + logger.info('Processing {} -> {}'.format(args.im_file, out_name)) + + vis_utils.vis_one_image( + im[:, :, ::-1], + args.im_file, + args.output_dir, + cls_boxes, + cls_segms, + cls_keyps, + dataset=dummy_coco_dataset, + box_alpha=0.3, + show_class=True, + thresh=0.7, + kp_thresh=2 + ) + + +def check_args(args): + assert ( + (args.rpn_pkl is not None and args.rpn_cfg is not None) or + (args.rpn_pkl is None and args.rpn_cfg is None) + ) + if args.rpn_pkl is not None: + args.rpn_pkl = cache_url(args.rpn_pkl, cfg.DOWNLOAD_CACHE) + assert os.path.exists(args.rpn_pkl) + assert os.path.exists(args.rpn_cfg) + if args.models_to_run is not None: + assert len(args.models_to_run) % 2 == 0 + for i, model_file in enumerate(args.models_to_run): + if len(model_file) > 0: + if i % 2 == 0: + model_file = cache_url(model_file, cfg.DOWNLOAD_CACHE) + args.models_to_run[i] = model_file + assert os.path.exists(model_file), \ + '\'{}\' does not exist'.format(model_file) + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + setup_logging(__name__) + args = parse_args() + check_args(args) + main(args) diff --git a/configs/tools/infer_simple.py b/configs/tools/infer_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..63506babfaadc22efe4c60245c5f9aacf3737638 --- /dev/null +++ b/configs/tools/infer_simple.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +# Copyright (c) 
2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Perform inference on a single image or all images with a certain extension +(e.g., .jpg) in a folder. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import defaultdict +import argparse +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import glob +import logging +import os +import sys +import time + +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import merge_cfg_from_file +from detectron.utils.io import cache_url +from detectron.utils.logging import setup_logging +from detectron.utils.timer import Timer +import detectron.core.test_engine as infer_engine +import detectron.datasets.dummy_datasets as dummy_datasets +import detectron.utils.c2 as c2_utils +import detectron.utils.vis as vis_utils + +c2_utils.import_detectron_ops() + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. 
+cv2.ocl.setUseOpenCL(False) + + +def parse_args(): + parser = argparse.ArgumentParser(description='End-to-end inference') + parser.add_argument( + '--cfg', + dest='cfg', + help='cfg model file (/path/to/model_config.yaml)', + default=None, + type=str + ) + parser.add_argument( + '--wts', + dest='weights', + help='weights model file (/path/to/model_weights.pkl)', + default=None, + type=str + ) + parser.add_argument( + '--output-dir', + dest='output_dir', + help='directory for visualization pdfs (default: /tmp/infer_simple)', + default='/tmp/infer_simple', + type=str + ) + parser.add_argument( + '--image-ext', + dest='image_ext', + help='image file name extension (default: jpg)', + default='jpg', + type=str + ) + parser.add_argument( + '--always-out', + dest='out_when_no_box', + help='output image even when no object is found', + action='store_true' + ) + parser.add_argument( + '--output-ext', + dest='output_ext', + help='output image file format (default: pdf)', + default='pdf', + type=str + ) + parser.add_argument( + '--thresh', + dest='thresh', + help='Threshold for visualizing detections', + default=0.7, + type=float + ) + parser.add_argument( + '--kp-thresh', + dest='kp_thresh', + help='Threshold for visualizing keypoints', + default=2.0, + type=float + ) + parser.add_argument( + 'im_or_folder', help='image or folder of images', default=None + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +def main(args): + logger = logging.getLogger(__name__) + + merge_cfg_from_file(args.cfg) + cfg.NUM_GPUS = 1 + args.weights = cache_url(args.weights, cfg.DOWNLOAD_CACHE) + assert_and_infer_cfg(cache_urls=False) + + assert not cfg.MODEL.RPN_ONLY, \ + 'RPN models are not supported' + assert not cfg.TEST.PRECOMPUTED_PROPOSALS, \ + 'Models that require precomputed proposals are not supported' + + model = infer_engine.initialize_model_from_cfg(args.weights) + dummy_coco_dataset = dummy_datasets.get_coco_dataset() + + if 
os.path.isdir(args.im_or_folder): + im_list = glob.iglob(args.im_or_folder + '/*.' + args.image_ext) + else: + im_list = [args.im_or_folder] + + for i, im_name in enumerate(im_list): + out_name = os.path.join( + args.output_dir, '{}'.format(os.path.basename(im_name) + '.' + args.output_ext) + ) + logger.info('Processing {} -> {}'.format(im_name, out_name)) + im = cv2.imread(im_name) + timers = defaultdict(Timer) + t = time.time() + with c2_utils.NamedCudaScope(0): + cls_boxes, cls_segms, cls_keyps = infer_engine.im_detect_all( + model, im, None, timers=timers + ) + logger.info('Inference time: {:.3f}s'.format(time.time() - t)) + for k, v in timers.items(): + logger.info(' | {}: {:.3f}s'.format(k, v.average_time)) + if i == 0: + logger.info( + ' \ Note: inference on the first image will be slower than the ' + 'rest (caches and auto-tuning need to warm up)' + ) + + vis_utils.vis_one_image( + im[:, :, ::-1], # BGR -> RGB for visualization + im_name, + args.output_dir, + cls_boxes, + cls_segms, + cls_keyps, + dataset=dummy_coco_dataset, + box_alpha=0.3, + show_class=True, + thresh=args.thresh, + kp_thresh=args.kp_thresh, + ext=args.output_ext, + out_when_no_box=args.out_when_no_box + ) + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + setup_logging(__name__) + args = parse_args() + main(args) diff --git a/configs/tools/pickle_caffe_blobs.py b/configs/tools/pickle_caffe_blobs.py new file mode 100644 index 0000000000000000000000000000000000000000..e8dc238f421bece6c5ef1bcf028f66c34b42ed8f --- /dev/null +++ b/configs/tools/pickle_caffe_blobs.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Script for converting Caffe (<= 1.0) models into the the simple state dict +format used by Detectron. For example, this script can convert the orignal +ResNet models released by MSRA. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import numpy as np +import os +import sys + +from caffe.proto import caffe_pb2 +from caffe2.proto import caffe2_pb2 +from caffe2.python import caffe_translator +from caffe2.python import utils +from google.protobuf import text_format + +from detectron.utils.io import save_object + +def parse_args(): + parser = argparse.ArgumentParser( + description='Dump weights from a Caffe model' + ) + parser.add_argument( + '--prototxt', + dest='prototxt_file_name', + help='Network definition prototxt file path', + default=None, + type=str + ) + parser.add_argument( + '--caffemodel', + dest='caffemodel_file_name', + help='Pretrained network weights file path', + default=None, + type=str + ) + parser.add_argument( + '--output', + dest='out_file_name', + help='Output file path', + default=None, + type=str + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + + +def normalize_resnet_name(name): + if name.find('res') == 0 and name.find('res_') == -1: + # E.g., + # res4b11_branch2c -> res4_11_branch2c + # res2a_branch1 -> res2_0_branch1 + chunk = 
name[len('res'):name.find('_')] + name = ( + 'res' + chunk[0] + '_' + str( + int(chunk[2:]) if len(chunk) > 2 # e.g., "b1" -> 1 + else ord(chunk[1]) - ord('a') + ) + # e.g., "a" -> 0 + name[name.find('_'):] + ) + return name + + +def pickle_weights(out_file_name, weights): + blobs = { + normalize_resnet_name(blob.name): utils.Caffe2TensorToNumpyArray(blob) + for blob in weights.protos + } + save_object(blobs, out_file_name) + print('Wrote blobs:') + print(sorted(blobs.keys())) + + +def add_missing_biases(caffenet_weights): + for layer in caffenet_weights.layer: + if layer.type == 'Convolution' and len(layer.blobs) == 1: + num_filters = layer.blobs[0].shape.dim[0] + bias_blob = caffe_pb2.BlobProto() + bias_blob.data.extend(np.zeros(num_filters)) + bias_blob.num, bias_blob.channels, bias_blob.height = 1, 1, 1 + bias_blob.width = num_filters + layer.blobs.extend([bias_blob]) + + +def remove_spatial_bn_layers(caffenet, caffenet_weights): + # Layer types associated with spatial batch norm + remove_types = ['BatchNorm', 'Scale'] + + def _remove_layers(net): + for i in reversed(range(len(net.layer))): + if net.layer[i].type in remove_types: + net.layer.pop(i) + + # First remove layers from caffenet proto + _remove_layers(caffenet) + # We'll return these so we can save the batch norm parameters + bn_layers = [ + layer for layer in caffenet_weights.layer if layer.type in remove_types + ] + _remove_layers(caffenet_weights) + + def _create_tensor(arr, shape, name): + t = caffe2_pb2.TensorProto() + t.name = name + t.data_type = caffe2_pb2.TensorProto.FLOAT + t.dims.extend(shape.dim) + t.float_data.extend(arr) + assert len(t.float_data) == np.prod(t.dims), 'Data size, shape mismatch' + return t + + bn_tensors = [] + for (bn, scl) in zip(bn_layers[0::2], bn_layers[1::2]): + assert bn.name[len('bn'):] == scl.name[len('scale'):], 'Pair mismatch' + blob_out = 'res' + bn.name[len('bn'):] + '_bn' + bn_mean = np.asarray(bn.blobs[0].data) + bn_var = np.asarray(bn.blobs[1].data) + scale 
= np.asarray(scl.blobs[0].data) + bias = np.asarray(scl.blobs[1].data) + std = np.sqrt(bn_var + 1e-5) + new_scale = scale / std + new_bias = bias - bn_mean * scale / std + new_scale_tensor = _create_tensor( + new_scale, bn.blobs[0].shape, blob_out + '_s' + ) + new_bias_tensor = _create_tensor( + new_bias, bn.blobs[0].shape, blob_out + '_b' + ) + bn_tensors.extend([new_scale_tensor, new_bias_tensor]) + return bn_tensors + + +def remove_layers_without_parameters(caffenet, caffenet_weights): + for i in reversed(range(len(caffenet_weights.layer))): + if len(caffenet_weights.layer[i].blobs) == 0: + # Search for the corresponding layer in caffenet and remove it + name = caffenet_weights.layer[i].name + found = False + for j in range(len(caffenet.layer)): + if caffenet.layer[j].name == name: + caffenet.layer.pop(j) + found = True + break + if not found and name[-len('_split'):] != '_split': + print('Warning: layer {} not found in caffenet'.format(name)) + caffenet_weights.layer.pop(i) + + +def normalize_shape(caffenet_weights): + for layer in caffenet_weights.layer: + for blob in layer.blobs: + shape = (blob.num, blob.channels, blob.height, blob.width) + if len(blob.data) != np.prod(shape): + shape = tuple(blob.shape.dim) + if len(shape) == 1: + # Handle biases + shape = (1, 1, 1, shape[0]) + if len(shape) == 2: + # Handle InnerProduct layers + shape = (1, 1, shape[0], shape[1]) + assert len(shape) == 4 + blob.num, blob.channels, blob.height, blob.width = shape + + +def load_and_convert_caffe_model(prototxt_file_name, caffemodel_file_name): + caffenet = caffe_pb2.NetParameter() + caffenet_weights = caffe_pb2.NetParameter() + text_format.Merge(open(prototxt_file_name).read(), caffenet) + caffenet_weights.ParseFromString(open(caffemodel_file_name).read()) + # C2 conv layers current require biases, but they are optional in C1 + # Add zeros as biases is they are missing + add_missing_biases(caffenet_weights) + # We only care about getting parameters, so remove layers w/o 
parameters + remove_layers_without_parameters(caffenet, caffenet_weights) + # BatchNorm is not implemented in the translator *and* we need to fold Scale + # layers into the new C2 SpatialBN op, hence we remove the batch norm layers + # and apply custom translations code + bn_weights = remove_spatial_bn_layers(caffenet, caffenet_weights) + # Set num, channel, height and width for blobs that use shape.dim instead + normalize_shape(caffenet_weights) + # Translate the rest of the model + net, pretrained_weights = caffe_translator.TranslateModel( + caffenet, caffenet_weights + ) + pretrained_weights.protos.extend(bn_weights) + return net, pretrained_weights + + +if __name__ == '__main__': + args = parse_args() + assert os.path.exists(args.prototxt_file_name), \ + 'Prototxt file does not exist' + assert os.path.exists(args.caffemodel_file_name), \ + 'Weights file does not exist' + net, weights = load_and_convert_caffe_model( + args.prototxt_file_name, args.caffemodel_file_name + ) + pickle_weights(args.out_file_name, weights) diff --git a/configs/tools/reval.py b/configs/tools/reval.py new file mode 100644 index 0000000000000000000000000000000000000000..c8138a97db7971d567379ce5db04749389ea9333 --- /dev/null +++ b/configs/tools/reval.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Reval = re-eval. Re-evaluate saved detections.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import os +import sys + +from detectron.core.config import cfg +from detectron.datasets import task_evaluation +from detectron.datasets.json_dataset import JsonDataset +from detectron.utils.io import load_object +from detectron.utils.logging import setup_logging +import detectron.core.config as core_config + + +def parse_args(): + parser = argparse.ArgumentParser(description='Re-evaluate results') + parser.add_argument( + 'output_dir', nargs=1, help='results directory', type=str + ) + parser.add_argument( + '--dataset', + dest='dataset_name', + help='dataset to re-evaluate', + default='voc_2007_test', + type=str + ) + parser.add_argument( + '--matlab', + dest='matlab_eval', + help='use matlab for evaluation', + action='store_true' + ) + parser.add_argument( + '--comp', + dest='comp_mode', + help='competition mode', + action='store_true' + ) + parser.add_argument( + '--cfg', + dest='cfg_file', + help='optional config file', + default=None, + type=str + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + + +def do_reval(dataset_name, output_dir, args): + dataset = JsonDataset(dataset_name) + dets = load_object(os.path.join(output_dir, 'detections.pkl')) + + # Override config with the one saved in the detections file + if args.cfg_file is not None: + core_config.merge_cfg_from_cfg(core_config.load_cfg(dets['cfg'])) + else: + 
core_config._merge_a_into_b(core_config.load_cfg(dets['cfg']), cfg) + results = task_evaluation.evaluate_all( + dataset, + dets['all_boxes'], + dets['all_segms'], + dets['all_keyps'], + output_dir, + use_matlab=args.matlab_eval + ) + task_evaluation.log_copy_paste_friendly_results(results) + + +if __name__ == '__main__': + setup_logging(__name__) + args = parse_args() + if args.comp_mode: + cfg.TEST.COMPETITION_MODE = True + output_dir = os.path.abspath(args.output_dir[0]) + do_reval(args.dataset_name, output_dir, args) diff --git a/configs/tools/test_net.py b/configs/tools/test_net.py new file mode 100644 index 0000000000000000000000000000000000000000..4afa4c60522c96d29b85af8373de669e4c15dc4e --- /dev/null +++ b/configs/tools/test_net.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Perform inference on one or more datasets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import os +import pprint +import sys +import time + +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import merge_cfg_from_file +from detectron.core.config import merge_cfg_from_list +from detectron.core.test_engine import run_inference +from detectron.utils.logging import setup_logging +import detectron.utils.c2 as c2_utils + +c2_utils.import_detectron_ops() + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. +cv2.ocl.setUseOpenCL(False) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + parser.add_argument( + '--cfg', + dest='cfg_file', + help='optional config file', + default=None, + type=str + ) + parser.add_argument( + '--wait', + dest='wait', + help='wait until net file exists', + default=True, + type=bool + ) + parser.add_argument( + '--vis', dest='vis', help='visualize detections', action='store_true' + ) + parser.add_argument( + '--multi-gpu-testing', + dest='multi_gpu_testing', + help='using cfg.NUM_GPUS for inference', + action='store_true' + ) + parser.add_argument( + '--range', + dest='range', + help='start (inclusive) and end (exclusive) indices', + default=None, + type=int, + nargs=2 + ) + parser.add_argument( + 'opts', + help='See detectron/core/config.py for all options', + default=None, + nargs=argparse.REMAINDER + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +if __name__ == 
'__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + logger = setup_logging(__name__) + args = parse_args() + logger.info('Called with args:') + logger.info(args) + if args.cfg_file is not None: + merge_cfg_from_file(args.cfg_file) + if args.opts is not None: + merge_cfg_from_list(args.opts) + assert_and_infer_cfg() + logger.info('Testing with config:') + logger.info(pprint.pformat(cfg)) + + while not os.path.exists(cfg.TEST.WEIGHTS) and args.wait: + logger.info('Waiting for \'{}\' to exist...'.format(cfg.TEST.WEIGHTS)) + time.sleep(10) + + run_inference( + cfg.TEST.WEIGHTS, + ind_range=args.range, + multi_gpu_testing=args.multi_gpu_testing, + check_expected_results=True, + ) diff --git a/configs/tools/train_net.py b/configs/tools/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..9e757b599690a9f190c2c9b9e4769abdc079a134 --- /dev/null +++ b/configs/tools/train_net.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Train a network with Detectron.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import logging +import numpy as np +import pprint +import sys + +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import merge_cfg_from_file +from detectron.core.config import merge_cfg_from_list +from detectron.core.test_engine import run_inference +from detectron.utils.logging import setup_logging +import detectron.utils.c2 as c2_utils +import detectron.utils.train + +c2_utils.import_contrib_ops() +c2_utils.import_detectron_ops() + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. 
+cv2.ocl.setUseOpenCL(False) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Train a network with Detectron' + ) + parser.add_argument( + '--cfg', + dest='cfg_file', + help='Config file for training (and optionally testing)', + default=None, + type=str + ) + parser.add_argument( + '--multi-gpu-testing', + dest='multi_gpu_testing', + help='Use cfg.NUM_GPUS GPUs for inference', + action='store_true' + ) + parser.add_argument( + '--skip-test', + dest='skip_test', + help='Do not test the final model', + action='store_true' + ) + parser.add_argument( + 'opts', + help='See detectron/core/config.py for all options', + default=None, + nargs=argparse.REMAINDER + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +def main(): + # Initialize C2 + workspace.GlobalInit( + ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'] + ) + # Set up logging and load config options + logger = setup_logging(__name__) + logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO) + args = parse_args() + logger.info('Called with args:') + logger.info(args) + if args.cfg_file is not None: + merge_cfg_from_file(args.cfg_file) + if args.opts is not None: + merge_cfg_from_list(args.opts) + assert_and_infer_cfg() + smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info() + logger.info("cuda version : {}".format(cuda_ver)) + logger.info("cudnn version: {}".format(cudnn_ver)) + logger.info("nvidia-smi output:\n{}".format(smi_output)) + logger.info('Training with config:') + logger.info(pprint.pformat(cfg)) + # Note that while we set the numpy random seed network training will not be + # deterministic in general. There are sources of non-determinism that cannot + # be removed with a reasonble execution-speed tradeoff (such as certain + # non-deterministic cudnn functions). 
+ np.random.seed(cfg.RNG_SEED) + # Execute the training run + checkpoints = detectron.utils.train.train_model() + # Test the trained model + if not args.skip_test: + test_model(checkpoints['final'], args.multi_gpu_testing, args.opts) + + +def test_model(model_file, multi_gpu_testing, opts=None): + """Test a model.""" + # Clear memory before inference + workspace.ResetWorkspace() + # Run inference + run_inference( + model_file, multi_gpu_testing=multi_gpu_testing, + check_expected_results=True, + ) + + +if __name__ == '__main__': + main() diff --git a/configs/tools/visualize_results.py b/configs/tools/visualize_results.py new file mode 100644 index 0000000000000000000000000000000000000000..fc83e444ddbe17565cd04602e729bebeda6ec1ea --- /dev/null +++ b/configs/tools/visualize_results.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Script for visualizing results saved in a detections.pkl file.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import cv2 +import os +import sys + +from detectron.datasets.json_dataset import JsonDataset +from detectron.utils.io import load_object +import detectron.utils.vis as vis_utils + +# OpenCL may be enabled by default in OpenCV3; disable it because it's not +# thread safe and causes unwanted GPU memory allocations. +cv2.ocl.setUseOpenCL(False) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--dataset', + dest='dataset', + help='dataset', + default='coco_2014_minival', + type=str + ) + parser.add_argument( + '--detections', + dest='detections', + help='detections pkl file', + default='', + type=str + ) + parser.add_argument( + '--thresh', + dest='thresh', + help='detection prob threshold', + default=0.9, + type=float + ) + parser.add_argument( + '--output-dir', + dest='output_dir', + help='output directory', + default='./tmp/vis-output', + type=str + ) + parser.add_argument( + '--first', + dest='first', + help='only visualize the first k images', + default=0, + type=int + ) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + return args + + +def vis(dataset, detections_pkl, thresh, output_dir, limit=0): + ds = JsonDataset(dataset) + roidb = ds.get_roidb() + + dets = load_object(detections_pkl) + + assert all(k in dets for k in ['all_boxes', 'all_segms', 'all_keyps']), \ + 'Expected detections pkl file in the format used by test_engine.py' + + all_boxes = dets['all_boxes'] + all_segms = dets['all_segms'] + all_keyps = dets['all_keyps'] + + def id_or_index(ix, val): + if len(val) == 0: + return val + else: + return val[ix] + + for ix, entry in enumerate(roidb): + if limit 
> 0 and ix >= limit: + break + if ix % 10 == 0: + print('{:d}/{:d}'.format(ix + 1, len(roidb))) + + im = cv2.imread(entry['image']) + im_name = os.path.splitext(os.path.basename(entry['image']))[0] + + cls_boxes_i = [ + id_or_index(ix, cls_k_boxes) for cls_k_boxes in all_boxes + ] + cls_segms_i = [ + id_or_index(ix, cls_k_segms) for cls_k_segms in all_segms + ] + cls_keyps_i = [ + id_or_index(ix, cls_k_keyps) for cls_k_keyps in all_keyps + ] + + vis_utils.vis_one_image( + im[:, :, ::-1], + '{:d}_{:s}'.format(ix, im_name), + os.path.join(output_dir, 'vis'), + cls_boxes_i, + segms=cls_segms_i, + keypoints=cls_keyps_i, + thresh=thresh, + box_alpha=0.8, + dataset=ds, + show_class=True + ) + + +if __name__ == '__main__': + opts = parse_args() + vis( + opts.dataset, + opts.detections, + opts.thresh, + opts.output_dir, + limit=opts.first + ) diff --git a/demo/15673749081_767a7fa63a_k.jpg b/demo/15673749081_767a7fa63a_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0ec90e11f64603f0a2136c142a4ba0089a8ba051 Binary files /dev/null and b/demo/15673749081_767a7fa63a_k.jpg differ diff --git a/demo/16004479832_a748d55f21_k.jpg b/demo/16004479832_a748d55f21_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5ef06804cf6bde89396c5892c8b463e1b0fd960b Binary files /dev/null and b/demo/16004479832_a748d55f21_k.jpg differ diff --git a/demo/17790319373_bd19b24cfc_k.jpg b/demo/17790319373_bd19b24cfc_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8d7ce5e33b75aa926cce67e34f82e53e43b6b4fc Binary files /dev/null and b/demo/17790319373_bd19b24cfc_k.jpg differ diff --git a/demo/18124840932_e42b3e377c_k.jpg b/demo/18124840932_e42b3e377c_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0e20882bbeb16fb6f1e7353e8d66375f678f930d Binary files /dev/null and b/demo/18124840932_e42b3e377c_k.jpg differ diff --git a/demo/19064748793_bb942deea1_k.jpg b/demo/19064748793_bb942deea1_k.jpg new file mode 
100644 index 0000000000000000000000000000000000000000..6269382798382a8bb1e64ae94bb6156d536a8bc0 Binary files /dev/null and b/demo/19064748793_bb942deea1_k.jpg differ diff --git a/demo/24274813513_0cfd2ce6d0_k.jpg b/demo/24274813513_0cfd2ce6d0_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2f3271a28622fa442fb594c48611374d1bd73f20 Binary files /dev/null and b/demo/24274813513_0cfd2ce6d0_k.jpg differ diff --git a/demo/33823288584_1d21cf0a26_k.jpg b/demo/33823288584_1d21cf0a26_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c218118f94f3c7469fbb7ddb4d3d162b73669660 Binary files /dev/null and b/demo/33823288584_1d21cf0a26_k.jpg differ diff --git a/demo/33887522274_eebd074106_k.jpg b/demo/33887522274_eebd074106_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3173f58d1e96a72e97b89e9461665c15f27b9acf Binary files /dev/null and b/demo/33887522274_eebd074106_k.jpg differ diff --git a/demo/34501842524_3c858b3080_k.jpg b/demo/34501842524_3c858b3080_k.jpg new file mode 100644 index 0000000000000000000000000000000000000000..26398dcde42f2fb78c7aa26fbabead880f422cce Binary files /dev/null and b/demo/34501842524_3c858b3080_k.jpg differ diff --git a/demo/NOTICE b/demo/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..506f76e8699c54694a038bc262db6394af45f584 --- /dev/null +++ b/demo/NOTICE @@ -0,0 +1,32 @@ +The demo images are licensed as United States government work: +https://www.usa.gov/government-works + +The image files were obtained on Jan 13, 2018 from the following +URLs. 
+ +16004479832_a748d55f21_k.jpg +https://www.flickr.com/photos/archivesnews/16004479832 + +18124840932_e42b3e377c_k.jpg +https://www.flickr.com/photos/usnavy/18124840932 + +33887522274_eebd074106_k.jpg +https://www.flickr.com/photos/usaid_pakistan/33887522274 + +15673749081_767a7fa63a_k.jpg +https://www.flickr.com/photos/usnavy/15673749081 + +34501842524_3c858b3080_k.jpg +https://www.flickr.com/photos/departmentofenergy/34501842524 + +24274813513_0cfd2ce6d0_k.jpg +https://www.flickr.com/photos/dhsgov/24274813513 + +19064748793_bb942deea1_k.jpg +https://www.flickr.com/photos/statephotos/19064748793 + +33823288584_1d21cf0a26_k.jpg +https://www.flickr.com/photos/cbpphotos/33823288584 + +17790319373_bd19b24cfc_k.jpg +https://www.flickr.com/photos/secdef/17790319373 diff --git a/demo/output/17790319373_bd19b24cfc_k_example_output.jpg b/demo/output/17790319373_bd19b24cfc_k_example_output.jpg new file mode 100644 index 0000000000000000000000000000000000000000..911626d7a3863a35480f3e6718b9b5897666e79a Binary files /dev/null and b/demo/output/17790319373_bd19b24cfc_k_example_output.jpg differ diff --git a/demo/output/33823288584_1d21cf0a26_k_example_output.jpg b/demo/output/33823288584_1d21cf0a26_k_example_output.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7e806a49c84e8521c8d608a2a06c8f0dfc663f23 Binary files /dev/null and b/demo/output/33823288584_1d21cf0a26_k_example_output.jpg differ diff --git a/detectron/__init__.py b/detectron/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/detectron/core/__init__.py b/detectron/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/detectron/core/config.py b/detectron/core/config.py new file mode 100644 index 0000000000000000000000000000000000000000..7c07ec137c271058ba38ba359aeef1a5a9342c60 --- /dev/null +++ b/detectron/core/config.py 
@@ -0,0 +1,1303 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Detectron config system. + +This file specifies default config options for Detectron. You should not +change values in this file. Instead, you should write a config file (in yaml) +and use merge_cfg_from_file(yaml_file) to load it and override the default +options. + +Most tools in the tools directory take a --cfg option to specify an override +file and an optional list of override (key, value) pairs: + - See tools/{train,test}_net.py for example code that uses merge_cfg_from_file + - See configs/*/*.yaml for example config files + +Detectron supports a lot of different model types, each of which has a lot of +different options. The result is a HUGE set of configuration options. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from ast import literal_eval +from future.utils import iteritems +import copy +import io +import logging +import numpy as np +import os +import os.path as osp +import six + +from detectron.utils.collections import AttrDict +from detectron.utils.io import cache_url + +logger = logging.getLogger(__name__) + +__C = AttrDict() +# Consumers can get config by: +# from detectron.core.config import cfg +cfg = __C + +# Random note: avoid using '.ON' as a config key since yaml converts it to True; +# prefer 'ENABLED' instead + +# ---------------------------------------------------------------------------- # +# Training options +# ---------------------------------------------------------------------------- # +__C.TRAIN = AttrDict() + +# Initialize network with weights from this .pkl file +__C.TRAIN.WEIGHTS = '' + +# Datasets to train on +# Available dataset list: detectron.datasets.dataset_catalog.datasets() +# If multiple datasets are listed, the model is trained on their union +__C.TRAIN.DATASETS = () + +# Scales to use during training +# Each scale is the pixel size of an image's shortest side +# If multiple scales are listed, then one is selected uniformly at random for +# each training image (i.e., scale jitter data augmentation) +__C.TRAIN.SCALES = (600, ) + +# Max pixel size of the longest side of a scaled input image +__C.TRAIN.MAX_SIZE = 1000 + +# Images *per GPU* in the training minibatch +# Total images per minibatch = TRAIN.IMS_PER_BATCH * NUM_GPUS +__C.TRAIN.IMS_PER_BATCH = 2 + +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS +# E.g., a common configuration is: 512 * 2 * 8 = 8192 +__C.TRAIN.BATCH_SIZE_PER_IM = 64 + +# Target fraction of RoI minibatch that is labeled 
foreground (i.e. class > 0) +__C.TRAIN.FG_FRACTION = 0.25 + +# Overlap threshold for an RoI to be considered foreground (if >= FG_THRESH) +__C.TRAIN.FG_THRESH = 0.5 + +# Overlap threshold for an RoI to be considered background (class = 0 if +# overlap in [LO, HI)) +__C.TRAIN.BG_THRESH_HI = 0.5 +__C.TRAIN.BG_THRESH_LO = 0.0 + +# Use horizontally-flipped images during training? +__C.TRAIN.USE_FLIPPED = True + +# Overlap required between an RoI and a ground-truth box in order for that +# (RoI, gt box) pair to be used as a bounding-box regression training example +__C.TRAIN.BBOX_THRESH = 0.5 + +# Snapshot (model checkpoint) period +# Divide by NUM_GPUS to determine actual period (e.g., 80000/8 => 10000 iters) +# to allow for linear training schedule scaling +__C.TRAIN.SNAPSHOT_ITERS = 80000 + +# Train using these proposals +# During training, all proposals specified in the file are used (no limit is +# applied) +# Proposal files must be in correspondence with the datasets listed in +# TRAIN.DATASETS +__C.TRAIN.PROPOSAL_FILES = () + +# Make minibatches from images that have similar aspect ratios (i.e. 
both +# tall and thin or both short and wide) +# This feature is critical for saving memory (and makes training slightly +# faster) +__C.TRAIN.ASPECT_GROUPING = True + +# ---------------------------------------------------------------------------- # +# RPN training options +# ---------------------------------------------------------------------------- # + +# Run GenerateProposals on GPU if set to True +__C.TRAIN.GENERATE_PROPOSALS_ON_GPU = False + +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IOU >= thresh ==> positive RPN +# example) +__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 + +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative examples (IOU < thresh ==> negative RPN +# example) +__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 + +# Target fraction of foreground (positive) examples per RPN minibatch +__C.TRAIN.RPN_FG_FRACTION = 0.5 + +# Total number of RPN examples per image +__C.TRAIN.RPN_BATCH_SIZE_PER_IM = 256 + +# NMS threshold used on RPN proposals (used during end-to-end training with RPN) +__C.TRAIN.RPN_NMS_THRESH = 0.7 + +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 + +# Number of top scoring RPN proposals to keep after applying NMS +# This is the total number of RPN proposals produced (for both FPN and non-FPN +# cases) +__C.TRAIN.RPN_POST_NMS_TOP_N = 2000 + +# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +__C.TRAIN.RPN_STRADDLE_THRESH = 0 + +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (at orig image scale; not scale used during training or inference) +__C.TRAIN.RPN_MIN_SIZE = 0 + +# Filter proposals that are inside of crowd regions by CROWD_FILTER_THRESH +# "Inside" is measured as: proposal-with-crowd intersection area divided by +# proposal area +__C.TRAIN.CROWD_FILTER_THRESH = 0.7 + +# Ignore ground-truth objects with area < this threshold +__C.TRAIN.GT_MIN_AREA = -1 + +# Freeze the backbone architecture during training if set to True +__C.TRAIN.FREEZE_CONV_BODY = False + +# Training will resume from the latest snapshot (model checkpoint) found in the +# output directory +__C.TRAIN.AUTO_RESUME = True + +# Training will copy TRAIN.WEIGHTS and treat it as a candidate checkpoint +__C.TRAIN.COPY_WEIGHTS = False + +# Add StopGrad at a specified stage so the bottom layers are frozen +__C.TRAIN.FREEZE_AT = 2 + + +# ---------------------------------------------------------------------------- # +# Data loader options (see detectron/roi_data/loader.py for more info) +# ---------------------------------------------------------------------------- # +__C.DATA_LOADER = AttrDict() + +# Number of Python threads to use for the data loader (warning: using too many +# threads can cause GIL-based interference with Python Ops leading to *slower* +# training; 4 seems to be the sweet spot in our experience) +__C.DATA_LOADER.NUM_THREADS = 4 + +# Size of the shared minibatch queue +__C.DATA_LOADER.MINIBATCH_QUEUE_SIZE = 64 + +# Capacity of the per GPU blobs queue +__C.DATA_LOADER.BLOBS_QUEUE_CAPACITY = 8 + + +# ---------------------------------------------------------------------------- # +# Inference ('test') options +# ---------------------------------------------------------------------------- # +__C.TEST = AttrDict() + +# Initialize network with weights from this .pkl file +__C.TEST.WEIGHTS = '' + +# Datasets to test on 
+# Available dataset list: detectron.datasets.dataset_catalog.datasets() +# If multiple datasets are listed, testing is performed on each one sequentially +__C.TEST.DATASETS = () + +# Scale to use during testing +__C.TEST.SCALE = 600 + +# Max pixel size of the longest side of a scaled input image +__C.TEST.MAX_SIZE = 1000 + +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +__C.TEST.NMS = 0.3 + +# Apply Fast R-CNN style bounding-box regression if True +__C.TEST.BBOX_REG = True + +# Test using these proposal files (must correspond with TEST.DATASETS) +__C.TEST.PROPOSAL_FILES = () + +# Run GenerateProposals on GPU if set to True +__C.TEST.GENERATE_PROPOSALS_ON_GPU = False + +# Limit on the number of proposals per image used during inference +__C.TEST.PROPOSAL_LIMIT = 2000 + +# NMS threshold used on RPN proposals +__C.TEST.RPN_NMS_THRESH = 0.7 + +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +__C.TEST.RPN_PRE_NMS_TOP_N = 12000 + +# Number of top scoring RPN proposals to keep after applying NMS +# This is the total number of RPN proposals produced (for both FPN and non-FPN +# cases) +__C.TEST.RPN_POST_NMS_TOP_N = 2000 + +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (at orig image scale; not scale used during training or inference) +__C.TEST.RPN_MIN_SIZE = 0 + +# Maximum number of detections to return per image (100 is based on the limit +# established for the COCO dataset) +__C.TEST.DETECTIONS_PER_IM = 100 + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +__C.TEST.SCORE_THRESH = 0.05 + +# Save detection results files if True +# If false, results files are cleaned up (they can be large) after local +# evaluation 
+__C.TEST.COMPETITION_MODE = True + +# Evaluate detections with the COCO json dataset eval code even if it's not the +# evaluation code for the dataset (e.g. evaluate PASCAL VOC results using the +# COCO API to get COCO style AP on PASCAL VOC) +__C.TEST.FORCE_JSON_DATASET_EVAL = False + +# [Inferred value; do not set directly in a config] +# Indicates if precomputed proposals are used at test time +# Not set for 1-stage models and 2-stage models with RPN subnetwork enabled +__C.TEST.PRECOMPUTED_PROPOSALS = True + +# Evaluate proposals in class-specific Average Recall (AR). +# It means that one first computes AR within each category and then averages +# over the categories. It is not biased towards the AR of frequent categories +# compared with class-agnostic AR. +__C.TEST.CLASS_SPECIFIC_AR = False + +# ---------------------------------------------------------------------------- # +# Test-time augmentations for bounding box detection +# See configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_2x.yaml for an example +# ---------------------------------------------------------------------------- # +__C.TEST.BBOX_AUG = AttrDict() + +# Enable test-time augmentation for bounding box detection if True +__C.TEST.BBOX_AUG.ENABLED = False + +# Heuristic used to combine predicted box scores +# Valid options: ('ID', 'AVG', 'UNION') +__C.TEST.BBOX_AUG.SCORE_HEUR = 'UNION' + +# Heuristic used to combine predicted box coordinates +# Valid options: ('ID', 'AVG', 'UNION') +__C.TEST.BBOX_AUG.COORD_HEUR = 'UNION' + +# Horizontal flip at the original scale (id transform) +__C.TEST.BBOX_AUG.H_FLIP = False + +# Each scale is the pixel size of an image's shortest side +__C.TEST.BBOX_AUG.SCALES = () + +# Max pixel size of the longer side +__C.TEST.BBOX_AUG.MAX_SIZE = 4000 + +# Horizontal flip at each scale +__C.TEST.BBOX_AUG.SCALE_H_FLIP = False + +# Apply scaling based on object size +__C.TEST.BBOX_AUG.SCALE_SIZE_DEP = False +__C.TEST.BBOX_AUG.AREA_TH_LO = 50**2 +__C.TEST.BBOX_AUG.AREA_TH_HI = 
180**2 + +# Each aspect ratio is relative to image width +__C.TEST.BBOX_AUG.ASPECT_RATIOS = () + +# Horizontal flip at each aspect ratio +__C.TEST.BBOX_AUG.ASPECT_RATIO_H_FLIP = False + +# ---------------------------------------------------------------------------- # +# Test-time augmentations for mask detection +# See configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_2x.yaml for an example +# ---------------------------------------------------------------------------- # +__C.TEST.MASK_AUG = AttrDict() + +# Enable test-time augmentation for instance mask detection if True +__C.TEST.MASK_AUG.ENABLED = False + +# Heuristic used to combine mask predictions +# SOFT prefix indicates that the computation is performed on soft masks +# Valid options: ('SOFT_AVG', 'SOFT_MAX', 'LOGIT_AVG') +__C.TEST.MASK_AUG.HEUR = 'SOFT_AVG' + +# Horizontal flip at the original scale (id transform) +__C.TEST.MASK_AUG.H_FLIP = False + +# Each scale is the pixel size of an image's shortest side +__C.TEST.MASK_AUG.SCALES = () + +# Max pixel size of the longer side +__C.TEST.MASK_AUG.MAX_SIZE = 4000 + +# Horizontal flip at each scale +__C.TEST.MASK_AUG.SCALE_H_FLIP = False + +# Apply scaling based on object size +__C.TEST.MASK_AUG.SCALE_SIZE_DEP = False +__C.TEST.MASK_AUG.AREA_TH = 180**2 + +# Each aspect ratio is relative to image width +__C.TEST.MASK_AUG.ASPECT_RATIOS = () + +# Horizontal flip at each aspect ratio +__C.TEST.MASK_AUG.ASPECT_RATIO_H_FLIP = False + +# ---------------------------------------------------------------------------- # +# Test-augmentations for keypoints detection +# configs/test_time_aug/keypoint_rcnn_R-50-FPN_1x.yaml +# ---------------------------------------------------------------------------- # +__C.TEST.KPS_AUG = AttrDict() + +# Enable test-time augmentation for keypoint detection if True +__C.TEST.KPS_AUG.ENABLED = False + +# Heuristic used to combine keypoint predictions +# Valid options: ('HM_AVG', 'HM_MAX') +__C.TEST.KPS_AUG.HEUR = 'HM_AVG' + +# Horizontal flip at 
the original scale (id transform) +__C.TEST.KPS_AUG.H_FLIP = False + +# Each scale is the pixel size of an image's shortest side +__C.TEST.KPS_AUG.SCALES = () + +# Max pixel size of the longer side +__C.TEST.KPS_AUG.MAX_SIZE = 4000 + +# Horizontal flip at each scale +__C.TEST.KPS_AUG.SCALE_H_FLIP = False + +# Apply scaling based on object size +__C.TEST.KPS_AUG.SCALE_SIZE_DEP = False +__C.TEST.KPS_AUG.AREA_TH = 180**2 + +# Eeach aspect ratio is realtive to image width +__C.TEST.KPS_AUG.ASPECT_RATIOS = () + +# Horizontal flip at each aspect ratio +__C.TEST.KPS_AUG.ASPECT_RATIO_H_FLIP = False + +# ---------------------------------------------------------------------------- # +# Soft NMS +# ---------------------------------------------------------------------------- # +__C.TEST.SOFT_NMS = AttrDict() + +# Use soft NMS instead of standard NMS if set to True +__C.TEST.SOFT_NMS.ENABLED = False +# See soft NMS paper for definition of these options +__C.TEST.SOFT_NMS.METHOD = 'linear' +__C.TEST.SOFT_NMS.SIGMA = 0.5 +# For the soft NMS overlap threshold, we simply use TEST.NMS + +# ---------------------------------------------------------------------------- # +# Bounding box voting (from the Multi-Region CNN paper) +# ---------------------------------------------------------------------------- # +__C.TEST.BBOX_VOTE = AttrDict() + +# Use box voting if set to True +__C.TEST.BBOX_VOTE.ENABLED = False + +# We use TEST.NMS threshold for the NMS step. 
VOTE_TH overlap threshold +# is used to select voting boxes (IoU >= VOTE_TH) for each box that survives NMS +__C.TEST.BBOX_VOTE.VOTE_TH = 0.8 + +# The method used to combine scores when doing bounding box voting +# Valid options include ('ID', 'AVG', 'IOU_AVG', 'GENERALIZED_AVG', 'QUASI_SUM') +__C.TEST.BBOX_VOTE.SCORING_METHOD = 'ID' + +# Hyperparameter used by the scoring method (it has different meanings for +# different methods) +__C.TEST.BBOX_VOTE.SCORING_METHOD_BETA = 1.0 + + +# ---------------------------------------------------------------------------- # +# Model options +# ---------------------------------------------------------------------------- # +__C.MODEL = AttrDict() + +# The type of model to use +# The string must match a function in the modeling.model_builder module +# (e.g., 'generalized_rcnn', 'mask_rcnn', ...) +__C.MODEL.TYPE = '' + +# The backbone conv body to use +# The string must match a function that is imported in modeling.model_builder +# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN +# backbone) +__C.MODEL.CONV_BODY = '' + +# Number of classes in the dataset; must be set +# E.g., 81 for COCO (80 foreground + 1 background) +__C.MODEL.NUM_CLASSES = -1 + +# Use a class agnostic bounding box regressor instead of the default per-class +# regressor +__C.MODEL.CLS_AGNOSTIC_BBOX_REG = False + +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +__C.MODEL.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) + +# The meaning of FASTER_RCNN depends on the context (training vs. inference): +# 1) During training, FASTER_RCNN = True means that end-to-end training will be +# used to jointly train the RPN subnetwork and the Fast R-CNN subnetwork +# (Faster R-CNN = RPN + Fast R-CNN). 
+# 2) During inference, FASTER_RCNN = True means that the model's RPN subnetwork +# will be used to generate proposals rather than relying on precomputed +# proposals. Note that FASTER_RCNN = True can be used at inference time even +# if the Faster R-CNN model was trained with stagewise training (which +# consists of alternating between RPN and Fast R-CNN training in a way that +# finally leads to a single network). +__C.MODEL.FASTER_RCNN = False + +# Indicates the model makes instance mask predictions (as in Mask R-CNN) +__C.MODEL.MASK_ON = False + +# Indicates the model makes keypoint predictions (as in Mask R-CNN for +# keypoints) +__C.MODEL.KEYPOINTS_ON = False + +# Indicates the model's computation terminates with the production of RPN +# proposals (i.e., it outputs proposals ONLY, no actual object detections) +__C.MODEL.RPN_ONLY = False + +# Caffe2 net execution type +# Use 'prof_dag' to get profiling statistics +__C.MODEL.EXECUTION_TYPE = 'dag' + + +# ---------------------------------------------------------------------------- # +# RetinaNet options +# ---------------------------------------------------------------------------- # +__C.RETINANET = AttrDict() + +# RetinaNet is used (instead of Fast/er/Mask R-CNN/R-FCN/RPN) if True +__C.RETINANET.RETINANET_ON = False + +# Anchor aspect ratios to use +__C.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) + +# Anchor scales per octave +__C.RETINANET.SCALES_PER_OCTAVE = 3 + +# At each FPN level, we generate anchors based on their scale, aspect_ratio, +# stride of the level, and we multiply the resulting anchor by ANCHOR_SCALE +__C.RETINANET.ANCHOR_SCALE = 4 + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +__C.RETINANET.NUM_CONVS = 4 + +# Weight for bbox_regression loss +__C.RETINANET.BBOX_REG_WEIGHT = 1.0 + +# Smooth L1 loss beta for bbox regression +__C.RETINANET.BBOX_REG_BETA = 0.11 + +# During inference, #locs to select based on cls score before NMS is performed 
+# per FPN level +__C.RETINANET.PRE_NMS_TOP_N = 1000 + +# IoU overlap ratio for labeling an anchor as positive +# Anchors with >= iou overlap are labeled positive +__C.RETINANET.POSITIVE_OVERLAP = 0.5 + +# IoU overlap ratio for labeling an anchor as negative +# Anchors with < iou overlap are labeled negative +__C.RETINANET.NEGATIVE_OVERLAP = 0.4 + +# Focal loss parameter: alpha +__C.RETINANET.LOSS_ALPHA = 0.25 + +# Focal loss parameter: gamma +__C.RETINANET.LOSS_GAMMA = 2.0 + +# Prior prob for the positives at the beginning of training. This is used to set +# the bias init for the logits layer +__C.RETINANET.PRIOR_PROB = 0.01 + +# Whether classification and bbox branch tower should be shared or not +__C.RETINANET.SHARE_CLS_BBOX_TOWER = False + +# Use class specific bounding box regression instead of the default class +# agnostic regression +__C.RETINANET.CLASS_SPECIFIC_BBOX = False + +# Whether softmax should be used in classification branch training +__C.RETINANET.SOFTMAX = False + +# Inference cls score threshold, anchors with score > INFERENCE_TH are +# considered for inference +__C.RETINANET.INFERENCE_TH = 0.05 + + +# ---------------------------------------------------------------------------- # +# Solver options +# Note: all solver options are used exactly as specified; the implication is +# that if you switch from training on 1 GPU to N GPUs, you MUST adjust the +# solver configuration accordingly. We suggest using gradual warmup and the +# linear learning rate scaling rule as described in +# "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour" Goyal et al. +# https://arxiv.org/abs/1706.02677 +# ---------------------------------------------------------------------------- # +__C.SOLVER = AttrDict() + +# Base learning rate for the specified schedule +__C.SOLVER.BASE_LR = 0.001 + +# Schedule type (see functions in utils.lr_policy for options) +# E.g., 'step', 'steps_with_decay', ... 
+__C.SOLVER.LR_POLICY = 'step' + +# Some LR Policies (by example): +# 'step' +# lr = SOLVER.BASE_LR * SOLVER.GAMMA ** (cur_iter // SOLVER.STEP_SIZE) +# 'steps_with_decay' +# SOLVER.STEPS = [0, 60000, 80000] +# SOLVER.GAMMA = 0.1 +# lr = SOLVER.BASE_LR * SOLVER.GAMMA ** current_step +# iters [0, 59999] are in current_step = 0, iters [60000, 79999] are in +# current_step = 1, and so on +# 'steps_with_lrs' +# SOLVER.STEPS = [0, 60000, 80000] +# SOLVER.LRS = [0.02, 0.002, 0.0002] +# lr = LRS[current_step] +# 'cosine_decay' +# lr = SOLVER.BASE_LR * (cos(PI * cur_iter / SOLVER.MAX_ITER) * 0.5 + 0.5) +# 'exp_decay' +# lr smoothly decays from SOLVER.BASE_LR to SOLVER.GAMMA * SOLVER.BASE_LR +# lr = SOLVER.BASE_LR * exp(np.log(SOLVER.GAMMA) * cur_iter / SOLVER.MAX_ITER) + +# Hyperparameter used by the specified policy +# For 'step', the current LR is multiplied by SOLVER.GAMMA at each step +# For 'exp_decay', SOLVER.GAMMA is the ratio between the final and initial LR. +__C.SOLVER.GAMMA = 0.1 + +# Uniform step size for 'steps' policy +__C.SOLVER.STEP_SIZE = 30000 + +# Non-uniform step iterations for 'steps_with_decay' or 'steps_with_lrs' +# policies +__C.SOLVER.STEPS = [] + +# Learning rates to use with 'steps_with_lrs' policy +__C.SOLVER.LRS = [] + +# Maximum number of SGD iterations +__C.SOLVER.MAX_ITER = 40000 + +# Momentum to use with SGD +__C.SOLVER.MOMENTUM = 0.9 + +# L2 regularization hyperparameter +__C.SOLVER.WEIGHT_DECAY = 0.0005 +# L2 regularization hyperparameter for GroupNorm's parameters +__C.SOLVER.WEIGHT_DECAY_GN = 0.0 + +# Warm up to SOLVER.BASE_LR over this number of SGD iterations +__C.SOLVER.WARM_UP_ITERS = 500 + +# Start the warm up from SOLVER.BASE_LR * SOLVER.WARM_UP_FACTOR +__C.SOLVER.WARM_UP_FACTOR = 1.0 / 3.0 + +# WARM_UP_METHOD can be either 'constant' or 'linear' (i.e., gradual) +__C.SOLVER.WARM_UP_METHOD = 'linear' + +# Scale the momentum update history by new_lr / old_lr when updating the +# learning rate (this is correct given 
MomentumSGDUpdateOp)
+__C.SOLVER.SCALE_MOMENTUM = True
+# Only apply the correction if the relative LR change exceeds this threshold
+# (prevents every change in linear warm up from scaling the momentum by a tiny
+# amount; momentum scaling is only important if the LR change is large)
+__C.SOLVER.SCALE_MOMENTUM_THRESHOLD = 1.1
+
+# Suppress logging of changes to LR unless the relative change exceeds this
+# threshold (prevents linear warm up from spamming the training log)
+__C.SOLVER.LOG_LR_CHANGE_THRESHOLD = 1.1
+
+
+# ---------------------------------------------------------------------------- #
+# Fast R-CNN options
+# ---------------------------------------------------------------------------- #
+__C.FAST_RCNN = AttrDict()
+
+# The type of RoI head to use for bounding box classification and regression
+# The string must match a function that is imported in modeling.model_builder
+# (e.g., 'head_builder.add_roi_2mlp_head' to specify a two hidden layer MLP)
+__C.FAST_RCNN.ROI_BOX_HEAD = ''
+
+# Hidden layer dimension when using an MLP for the RoI box head
+__C.FAST_RCNN.MLP_HEAD_DIM = 1024
+
+# Hidden Conv layer dimension when using Convs for the RoI box head
+__C.FAST_RCNN.CONV_HEAD_DIM = 256
+# Number of stacked Conv layers in the RoI box head
+__C.FAST_RCNN.NUM_STACKED_CONVS = 4
+
+# RoI transformation function (e.g., RoIPool or RoIAlign)
+# (RoIPoolF is the same as RoIPool; ignore the trailing 'F')
+__C.FAST_RCNN.ROI_XFORM_METHOD = 'RoIPoolF'
+
+# Number of grid sampling points in RoIAlign (usually use 2)
+# Only applies to RoIAlign
+__C.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO = 0
+
+# RoI transform output resolution
+# Note: some models may have constraints on what they can use, e.g. 
they use +# pretrained FC layers like in VGG16, and will ignore this option +__C.FAST_RCNN.ROI_XFORM_RESOLUTION = 14 + + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +__C.RPN = AttrDict() + +# [Infered value; do not set directly in a config] +# Indicates that the model contains an RPN subnetwork +__C.RPN.RPN_ON = False + +# RPN anchor sizes given in absolute pixels w.r.t. the scaled network input +# Note: these options are *not* used by FPN RPN; see FPN.RPN* options +__C.RPN.SIZES = (64, 128, 256, 512) + +# Stride of the feature map that RPN is attached +__C.RPN.STRIDE = 16 + +# RPN anchor aspect ratios +__C.RPN.ASPECT_RATIOS = (0.5, 1, 2) + + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +__C.FPN = AttrDict() + +# FPN is enabled if True +__C.FPN.FPN_ON = False + +# Channel dimension of the FPN feature levels +__C.FPN.DIM = 256 + +# Initialize the lateral connections to output zero if True +__C.FPN.ZERO_INIT_LATERAL = False + +# Stride of the coarsest FPN level +# This is needed so the input can be padded properly +__C.FPN.COARSEST_STRIDE = 32 + +# +# FPN may be used for just RPN, just object detection, or both +# + +# Use FPN for RoI transform for object detection if True +__C.FPN.MULTILEVEL_ROIS = False +# Hyperparameters for the RoI-to-FPN level mapping heuristic +__C.FPN.ROI_CANONICAL_SCALE = 224 # s0 +__C.FPN.ROI_CANONICAL_LEVEL = 4 # k0: where s0 maps to +# Coarsest level of the FPN pyramid +__C.FPN.ROI_MAX_LEVEL = 5 +# Finest level of the FPN pyramid +__C.FPN.ROI_MIN_LEVEL = 2 + +# Use FPN for RPN if True +__C.FPN.MULTILEVEL_RPN = False +# Coarsest level of the FPN pyramid +__C.FPN.RPN_MAX_LEVEL = 6 +# Finest level of the FPN pyramid +__C.FPN.RPN_MIN_LEVEL = 2 +# FPN RPN anchor aspect ratios 
+__C.FPN.RPN_ASPECT_RATIOS = (0.5, 1, 2) +# RPN anchors start at this size on RPN_MIN_LEVEL +# The anchor size doubled each level after that +# With a default of 32 and levels 2 to 6, we get anchor sizes of 32 to 512 +__C.FPN.RPN_ANCHOR_START_SIZE = 32 +# Use extra FPN levels, as done in the RetinaNet paper +__C.FPN.EXTRA_CONV_LEVELS = False +# Use GroupNorm in the FPN-specific layers (lateral, etc.) +__C.FPN.USE_GN = False + + +# ---------------------------------------------------------------------------- # +# Mask R-CNN options ("MRCNN" means Mask R-CNN) +# ---------------------------------------------------------------------------- # +__C.MRCNN = AttrDict() + +# The type of RoI head to use for instance mask prediction +# The string must match a function this is imported in modeling.model_builder +# (e.g., 'mask_rcnn_heads.ResNet_mask_rcnn_fcn_head_v1up4convs') +__C.MRCNN.ROI_MASK_HEAD = '' + +# Resolution of mask predictions +__C.MRCNN.RESOLUTION = 14 + +# RoI transformation function and associated options +__C.MRCNN.ROI_XFORM_METHOD = 'RoIAlign' + +# RoI transformation function (e.g., RoIPool or RoIAlign) +__C.MRCNN.ROI_XFORM_RESOLUTION = 7 + +# Number of grid sampling points in RoIAlign (usually use 2) +# Only applies to RoIAlign +__C.MRCNN.ROI_XFORM_SAMPLING_RATIO = 0 + +# Number of channels in the mask head +__C.MRCNN.DIM_REDUCED = 256 + +# Use dilated convolution in the mask head +__C.MRCNN.DILATION = 2 + +# Upsample the predicted masks by this factor +__C.MRCNN.UPSAMPLE_RATIO = 1 + +# Use a fully-connected layer to predict the final masks instead of a conv layer +__C.MRCNN.USE_FC_OUTPUT = False + +# Weight initialization method for the mask head and mask output layers +__C.MRCNN.CONV_INIT = 'GaussianFill' + +# Use class specific mask predictions if True (otherwise use class agnostic mask +# predictions) +__C.MRCNN.CLS_SPECIFIC_MASK = True + +# Multi-task loss weight for masks +__C.MRCNN.WEIGHT_LOSS_MASK = 1.0 + +# Binarization threshold for converting soft 
masks to hard masks +__C.MRCNN.THRESH_BINARIZE = 0.5 + + +# ---------------------------------------------------------------------------- # +# Keypoint Mask R-CNN options ("KRCNN" = Mask R-CNN with Keypoint support) +# ---------------------------------------------------------------------------- # +__C.KRCNN = AttrDict() + +# The type of RoI head to use for instance keypoint prediction +# The string must match a function this is imported in modeling.model_builder +# (e.g., 'keypoint_rcnn_heads.add_roi_pose_head_v1convX') +__C.KRCNN.ROI_KEYPOINTS_HEAD = '' + +# Output size (and size loss is computed on), e.g., 56x56 +__C.KRCNN.HEATMAP_SIZE = -1 + +# Use bilinear interpolation to upsample the final heatmap by this factor +__C.KRCNN.UP_SCALE = -1 + +# Apply a ConvTranspose layer to the hidden representation computed by the +# keypoint head prior to predicting the per-keypoint heatmaps +__C.KRCNN.USE_DECONV = False +# Channel dimension of the hidden representation produced by the ConvTranspose +__C.KRCNN.DECONV_DIM = 256 + +# Use a ConvTranspose layer to predict the per-keypoint heatmaps +__C.KRCNN.USE_DECONV_OUTPUT = False + +# Use dilation in the keypoint head +__C.KRCNN.DILATION = 1 + +# Size of the kernels to use in all ConvTranspose operations +__C.KRCNN.DECONV_KERNEL = 4 + +# Number of keypoints in the dataset (e.g., 17 for COCO) +__C.KRCNN.NUM_KEYPOINTS = -1 + +# Number of stacked Conv layers in keypoint head +__C.KRCNN.NUM_STACKED_CONVS = 8 + +# Dimension of the hidden representation output by the keypoint head +__C.KRCNN.CONV_HEAD_DIM = 256 + +# Conv kernel size used in the keypoint head +__C.KRCNN.CONV_HEAD_KERNEL = 3 +# Conv kernel weight filling function +__C.KRCNN.CONV_INIT = 'GaussianFill' + +# Use NMS based on OKS if True +__C.KRCNN.NMS_OKS = False + +# Source of keypoint confidence +# Valid options: ('bbox', 'logit', 'prob') +__C.KRCNN.KEYPOINT_CONFIDENCE = 'bbox' + +# Standard ROI XFORM options (see FAST_RCNN or MRCNN options) +__C.KRCNN.ROI_XFORM_METHOD 
= 'RoIAlign' +__C.KRCNN.ROI_XFORM_RESOLUTION = 7 +__C.KRCNN.ROI_XFORM_SAMPLING_RATIO = 0 + +# Minimum number of labeled keypoints that must exist in a minibatch (otherwise +# the minibatch is discarded) +__C.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH = 20 + +# When infering the keypoint locations from the heatmap, don't scale the heatmap +# below this minimum size +__C.KRCNN.INFERENCE_MIN_SIZE = 0 + +# Multi-task loss weight to use for keypoints +# Recommended values: +# - use 1.0 if KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is True +# - use 4.0 if KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False +__C.KRCNN.LOSS_WEIGHT = 1.0 + +# Normalize by the total number of visible keypoints in the minibatch if True. +# Otherwise, normalize by the total number of keypoints that could ever exist +# in the minibatch. See comments in modeling.model_builder.add_keypoint_losses +# for detailed discussion. +__C.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS = True + + +# ---------------------------------------------------------------------------- # +# R-FCN options +# ---------------------------------------------------------------------------- # +__C.RFCN = AttrDict() + +# Position-sensitive RoI pooling output grid size (height and width) +__C.RFCN.PS_GRID_SIZE = 3 + + +# ---------------------------------------------------------------------------- # +# ResNets options ("ResNets" = ResNet and ResNeXt) +# ---------------------------------------------------------------------------- # +__C.RESNETS = AttrDict() + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +__C.RESNETS.NUM_GROUPS = 1 + +# Baseline width of each group +__C.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +__C.RESNETS.STRIDE_1X1 = True + +# Residual transformation function +__C.RESNETS.TRANS_FUNC = 'bottleneck_transformation' +# ResNet's stem function (conv1 and pool1) +__C.RESNETS.STEM_FUNC = 'basic_bn_stem' +# 
ResNet's shortcut function +__C.RESNETS.SHORTCUT_FUNC = 'basic_bn_shortcut' + +# Apply dilation in stage "res5" +__C.RESNETS.RES5_DILATION = 1 + + +# ---------------------------------------------------------------------------- # +# GroupNorm options +# ---------------------------------------------------------------------------- # +__C.GROUP_NORM = AttrDict() +# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) +__C.GROUP_NORM.DIM_PER_GP = -1 +# Number of groups in GroupNorm (-1 if using DIM_PER_GP) +__C.GROUP_NORM.NUM_GROUPS = 32 +# GroupNorm's small constant in the denominator +__C.GROUP_NORM.EPSILON = 1e-5 + + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # + +# Number of GPUs to use (applies to both training and testing) +__C.NUM_GPUS = 1 + +# Use NCCL for all reduce, otherwise use muji +# Warning: if set to True, you may experience deadlocks +__C.USE_NCCL = False + +# The mapping from image coordinates to feature map coordinates might cause +# some boxes that are distinct in image space to become identical in feature +# coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor +# for identifying duplicate boxes. +# 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 +__C.DEDUP_BOXES = 1 / 16. + +# Clip bounding box transformation predictions to prevent np.exp from +# overflowing +# Heuristic choice based on that would scale a 16 pixel anchor up to 1000 pixels +__C.BBOX_XFORM_CLIP = np.log(1000. / 16.) 
+
+# Pixel mean values (BGR order) as a (1, 1, 3) array
+# We use the same pixel mean for all networks even though it's not exactly what
+# they were trained with
+# "Fun" fact: the history of where these values come from is lost
+__C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]])
+
+# For reproducibility...but not really because modern fast GPU libraries use
+# non-deterministic op implementations
+__C.RNG_SEED = 3
+
+# A small number that's used many times
+__C.EPS = 1e-14
+
+# Root directory of project
+__C.ROOT_DIR = os.getcwd()
+
+# Output basedir
+__C.OUTPUT_DIR = '/tmp'
+
+# Name (or path to) the matlab executable
+__C.MATLAB = 'matlab'
+
+# Reduce memory usage with memonger gradient blob sharing
+__C.MEMONGER = True
+
+# Further reduce memory by allowing forward pass activations to be shared when
+# possible. Note that this will cause activation blob inspection (values,
+# shapes, etc.) to be meaningless when activation blobs are reused.
+__C.MEMONGER_SHARE_ACTIVATIONS = False
+
+# Dump detection visualizations
+__C.VIS = False
+
+# Score threshold for visualization
+__C.VIS_TH = 0.9
+
+# Expected results should take the form of a list of expectations, each
+# specified by four elements (dataset, task, metric, expected value). 
For +# example: [['coco_2014_minival', 'box_proposal', 'AR@1000', 0.387]] +__C.EXPECTED_RESULTS = [] +# Absolute and relative tolerance to use when comparing to EXPECTED_RESULTS +__C.EXPECTED_RESULTS_RTOL = 0.1 +__C.EXPECTED_RESULTS_ATOL = 0.005 +# When the expected value specifies a mean and standard deviation, we check +# that the actual value is within mean +/- SIGMA_TOL * std +__C.EXPECTED_RESULTS_SIGMA_TOL = 4 +# Set to send email in case of an EXPECTED_RESULTS failure +__C.EXPECTED_RESULTS_EMAIL = '' + +# Models and proposals referred to by URL are downloaded to a local cache +# specified by DOWNLOAD_CACHE +__C.DOWNLOAD_CACHE = '/tmp/detectron-download-cache' + + +# ---------------------------------------------------------------------------- # +# Cluster options +# ---------------------------------------------------------------------------- # +__C.CLUSTER = AttrDict() + +# Flag to indicate if the code is running in a cluster environment +__C.CLUSTER.ON_CLUSTER = False + + +# ---------------------------------------------------------------------------- # +# Deprecated options +# If an option is removed from the code and you don't want to break existing +# yaml configs, you can add the full config key as a string to the set below. +# ---------------------------------------------------------------------------- # +_DEPRECATED_KEYS = set( + { + 'FINAL_MSG', + 'MODEL.DILATION', + 'ROOT_GPU_ID', + 'RPN.ON', + 'TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED', + 'TRAIN.DROPOUT', + 'USE_GPU_NMS', + 'TEST.NUM_TEST_IMAGES', + } +) + + +# ---------------------------------------------------------------------------- # +# Renamed options +# If you rename a config option, record the mapping from the old name to the new +# name in the dictionary below. Optionally, if the type also changed, you can +# make the value a tuple that specifies first the renamed key and then +# instructions for how to edit the config file. 
+# ---------------------------------------------------------------------------- # +_RENAMED_KEYS = { + 'EXAMPLE.RENAMED.KEY': 'EXAMPLE.KEY', # Dummy example to follow + 'MODEL.PS_GRID_SIZE': 'RFCN.PS_GRID_SIZE', + 'MODEL.ROI_HEAD': 'FAST_RCNN.ROI_BOX_HEAD', + 'MRCNN.MASK_HEAD_NAME': 'MRCNN.ROI_MASK_HEAD', + 'TRAIN.DATASET': ( + 'TRAIN.DATASETS', + "Also convert to a tuple, e.g., " + + "'coco_2014_train' -> ('coco_2014_train',) or " + + "'coco_2014_train:coco_2014_valminusminival' -> " + + "('coco_2014_train', 'coco_2014_valminusminival')" + ), + 'TRAIN.PROPOSAL_FILE': ( + 'TRAIN.PROPOSAL_FILES', + "Also convert to a tuple, e.g., " + + "'path/to/file' -> ('path/to/file',) or " + + "'path/to/file1:path/to/file2' -> " + + "('path/to/file1', 'path/to/file2')" + ), + 'TEST.SCALES': ( + 'TEST.SCALE', + "Also convert from a tuple, e.g. (600, ), " + + "to a integer, e.g. 600." + ), + 'TEST.DATASET': ( + 'TEST.DATASETS', + "Also convert from a string, e.g 'coco_2014_minival', " + + "to a tuple, e.g. ('coco_2014_minival', )." + ), + 'TEST.PROPOSAL_FILE': ( + 'TEST.PROPOSAL_FILES', + "Also convert from a string, e.g. '/path/to/props.pkl', " + + "to a tuple, e.g. ('/path/to/props.pkl', )." + ), +} + + +# ---------------------------------------------------------------------------- # +# Renamed modules +# If a module containing a data structure used in the config (e.g. AttrDict) +# is renamed/moved and you don't want to break loading of existing yaml configs +# (e.g. from weights files) you can specify the renamed module below. +# ---------------------------------------------------------------------------- # +_RENAMED_MODULES = { + 'utils.collections': 'detectron.utils.collections', +} + + +def assert_and_infer_cfg(cache_urls=True, make_immutable=True): + """Call this function in your script after you have finished setting all cfg + values that are necessary (e.g., merging a config from a file, merging + command line config options, etc.). 
By default, this function will also
+    mark the global cfg as immutable to prevent changing the global cfg settings
+    during script execution (which can lead to hard to debug errors or code
+    that's harder to understand than is necessary).
+    """
+    if __C.MODEL.RPN_ONLY or __C.MODEL.FASTER_RCNN:
+        __C.RPN.RPN_ON = True
+    if __C.RPN.RPN_ON or __C.RETINANET.RETINANET_ON:
+        __C.TEST.PRECOMPUTED_PROPOSALS = False
+    if cache_urls:
+        cache_cfg_urls()
+    if make_immutable:
+        cfg.immutable(True)
+
+
+def cache_cfg_urls():
+    """Download URLs in the config, cache them locally, and rewrite cfg to make
+    use of the locally cached file.
+    """
+    __C.TRAIN.WEIGHTS = cache_url(__C.TRAIN.WEIGHTS, __C.DOWNLOAD_CACHE)
+    __C.TEST.WEIGHTS = cache_url(__C.TEST.WEIGHTS, __C.DOWNLOAD_CACHE)
+    __C.TRAIN.PROPOSAL_FILES = tuple(
+        cache_url(f, __C.DOWNLOAD_CACHE) for f in __C.TRAIN.PROPOSAL_FILES
+    )
+    __C.TEST.PROPOSAL_FILES = tuple(
+        cache_url(f, __C.DOWNLOAD_CACHE) for f in __C.TEST.PROPOSAL_FILES
+    )
+
+
+def get_output_dir(datasets, training=True):
+    """Get the output directory determined by the current global config."""
+    assert isinstance(datasets, tuple([tuple, list] + list(six.string_types))), \
+        'datasets argument must be of type tuple, list or string'
+    is_string = isinstance(datasets, six.string_types)
+    dataset_name = datasets if is_string else ':'.join(datasets)
+    tag = 'train' if training else 'test'
+    # Layout: <OUTPUT_DIR>/<train|test>/<dataset_name>/<MODEL.TYPE>
+    outdir = osp.join(__C.OUTPUT_DIR, tag, dataset_name, __C.MODEL.TYPE)
+    if not osp.exists(outdir):
+        os.makedirs(outdir)
+    return outdir
+
+
+def load_cfg(cfg_to_load):
+    """Wrapper around yaml.load used for maintaining backward compatibility"""
+    file_types = [file, io.IOBase] if six.PY2 else [io.IOBase]  # noqa false positive
+    expected_types = tuple(file_types + list(six.string_types))
+    assert isinstance(cfg_to_load, expected_types), \
+        'Expected one of {}, got {}'.format(expected_types, type(cfg_to_load))
+    if isinstance(cfg_to_load, tuple(file_types)):
+        cfg_to_load 
= ''.join(cfg_to_load.readlines()) + for old_module, new_module in iteritems(_RENAMED_MODULES): + # yaml object encoding: !!python/object/new:. + old_module, new_module = 'new:' + old_module, 'new:' + new_module + cfg_to_load = cfg_to_load.replace(old_module, new_module) + # Import inline due to a circular dependency between env.py and config.py + import detectron.utils.env as envu + return envu.yaml_load(cfg_to_load) + + +def merge_cfg_from_file(cfg_filename): + """Load a yaml config file and merge it into the global config.""" + with open(cfg_filename, 'r') as f: + yaml_cfg = AttrDict(load_cfg(f)) + _merge_a_into_b(yaml_cfg, __C) + + +def merge_cfg_from_cfg(cfg_other): + """Merge `cfg_other` into the global config.""" + _merge_a_into_b(cfg_other, __C) + + +def merge_cfg_from_list(cfg_list): + """Merge config keys, values in a list (e.g., from command line) into the + global config. For example, `cfg_list = ['TEST.NMS', 0.5]`. + """ + assert len(cfg_list) % 2 == 0 + for full_key, v in zip(cfg_list[0::2], cfg_list[1::2]): + if _key_is_deprecated(full_key): + continue + if _key_is_renamed(full_key): + _raise_key_rename_error(full_key) + key_list = full_key.split('.') + d = __C + for subkey in key_list[:-1]: + assert subkey in d, 'Non-existent key: {}'.format(full_key) + d = d[subkey] + subkey = key_list[-1] + assert subkey in d, 'Non-existent key: {}'.format(full_key) + value = _decode_cfg_value(v) + value = _check_and_coerce_cfg_value_type( + value, d[subkey], subkey, full_key + ) + d[subkey] = value + + +def _merge_a_into_b(a, b, stack=None): + """Merge config dictionary a into config dictionary b, clobbering the + options in b whenever they are also specified in a. + """ + assert isinstance(a, AttrDict), \ + '`a` (cur type {}) must be an instance of {}'.format(type(a), AttrDict) + assert isinstance(b, AttrDict), \ + '`b` (cur type {}) must be an instance of {}'.format(type(b), AttrDict) + + for k, v_ in a.items(): + full_key = '.'.join(stack) + '.' 
+ k if stack is not None else k + # a must specify keys that are in b + if k not in b: + if _key_is_deprecated(full_key): + continue + elif _key_is_renamed(full_key): + _raise_key_rename_error(full_key) + else: + raise KeyError('Non-existent config key: {}'.format(full_key)) + + v = copy.deepcopy(v_) + v = _decode_cfg_value(v) + v = _check_and_coerce_cfg_value_type(v, b[k], k, full_key) + + # Recursively merge dicts + if isinstance(v, AttrDict): + try: + stack_push = [k] if stack is None else stack + [k] + _merge_a_into_b(v, b[k], stack=stack_push) + except BaseException: + raise + else: + b[k] = v + + +def _key_is_deprecated(full_key): + if full_key in _DEPRECATED_KEYS: + logger.warn( + 'Deprecated config key (ignoring): {}'.format(full_key) + ) + return True + return False + + +def _key_is_renamed(full_key): + return full_key in _RENAMED_KEYS + + +def _raise_key_rename_error(full_key): + new_key = _RENAMED_KEYS[full_key] + if isinstance(new_key, tuple): + msg = ' Note: ' + new_key[1] + new_key = new_key[0] + else: + msg = '' + raise KeyError( + 'Key {} was renamed to {}; please update your config.{}'. + format(full_key, new_key, msg) + ) + + +def _decode_cfg_value(v): + """Decodes a raw config value (e.g., from a yaml config files or command + line argument) into a Python object. + """ + # Configs parsed from raw yaml will contain dictionary keys that need to be + # converted to AttrDict objects + if isinstance(v, dict): + return AttrDict(v) + # All remaining processing is only applied to strings + if not isinstance(v, six.string_types): + return v + # Try to interpret `v` as a: + # string, number, tuple, list, dict, boolean, or None + try: + v = literal_eval(v) + # The following two excepts allow v to pass through when it represents a + # string. + # + # Longer explanation: + # The type of v is always a string (before calling literal_eval), but + # sometimes it *represents* a string and other times a data structure, like + # a list. 
In the case that v represents a string, what we got back from the + # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is + # ok with '"foo"', but will raise a ValueError if given 'foo'. In other + # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval + # will raise a SyntaxError. + except ValueError: + pass + except SyntaxError: + pass + return v + + +def _check_and_coerce_cfg_value_type(value_a, value_b, key, full_key): + """Checks that `value_a`, which is intended to replace `value_b` is of the + right type. The type is correct if it matches exactly or is one of a few + cases in which the type can be easily coerced. + """ + # The types must match (with some exceptions) + type_b = type(value_b) + type_a = type(value_a) + if type_a is type_b: + return value_a + + # Exceptions: numpy arrays, strings, tuple<->list + if isinstance(value_b, np.ndarray): + value_a = np.array(value_a, dtype=value_b.dtype) + elif isinstance(value_b, six.string_types): + value_a = str(value_a) + elif isinstance(value_a, tuple) and isinstance(value_b, list): + value_a = list(value_a) + elif isinstance(value_a, list) and isinstance(value_b, tuple): + value_a = tuple(value_a) + else: + raise ValueError( + 'Type mismatch ({} vs. {}) with values ({} vs. {}) for config ' + 'key: {}'.format(type_b, type_a, value_b, value_a, full_key) + ) + return value_a diff --git a/detectron/core/rpn_generator.py b/detectron/core/rpn_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..d15819f68c354b618cf3e13b9055d247d8aa39fb --- /dev/null +++ b/detectron/core/rpn_generator.py @@ -0,0 +1,279 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def generate_rpn_on_dataset(
    weights_file,
    dataset_name,
    _proposal_file_ignored,
    output_dir,
    multi_gpu=False,
    gpu_id=0
):
    """Run inference on a dataset."""
    dataset = JsonDataset(dataset_name)
    timer = Timer()
    timer.tic()
    if multi_gpu:
        # Fan out over subprocesses; each one handles a slice of the dataset.
        num_images = len(dataset.get_roidb())
        _boxes, _scores, _ids, rpn_file = multi_gpu_generate_rpn_on_dataset(
            weights_file, dataset_name, _proposal_file_ignored, num_images,
            output_dir
        )
    else:
        # Single-GPU path: process the entire dataset range in this process.
        _boxes, _scores, _ids, rpn_file = generate_rpn_on_range(
            weights_file,
            dataset_name,
            _proposal_file_ignored,
            output_dir,
            gpu_id=gpu_id
        )
    timer.toc()
    logger.info('Total inference time: {:.3f}s'.format(timer.average_time))
    # Evaluate average recall of the written proposal file.
    return evaluate_proposal_file(dataset, rpn_file, output_dir)
+ """ + assert cfg.MODEL.RPN_ONLY or cfg.MODEL.FASTER_RCNN + + roidb, start_ind, end_ind, total_num_images = get_roidb( + dataset_name, ind_range + ) + logger.info( + 'Output will be saved to: {:s}'.format(os.path.abspath(output_dir)) + ) + + model = model_builder.create(cfg.MODEL.TYPE, train=False, gpu_id=gpu_id) + nu.initialize_gpu_from_weights_file( + model, weights_file, gpu_id=gpu_id, + ) + model_builder.add_inference_inputs(model) + workspace.CreateNet(model.net) + + boxes, scores, ids = generate_proposals_on_roidb( + model, + roidb, + start_ind=start_ind, + end_ind=end_ind, + total_num_images=total_num_images, + gpu_id=gpu_id, + ) + + cfg_yaml = envu.yaml_dump(cfg) + if ind_range is not None: + rpn_name = 'rpn_proposals_range_%s_%s.pkl' % tuple(ind_range) + else: + rpn_name = 'rpn_proposals.pkl' + rpn_file = os.path.join(output_dir, rpn_name) + save_object( + dict(boxes=boxes, scores=scores, ids=ids, cfg=cfg_yaml), rpn_file + ) + logger.info('Wrote RPN proposals to {}'.format(os.path.abspath(rpn_file))) + return boxes, scores, ids, rpn_file + + +def generate_proposals_on_roidb( + model, roidb, start_ind=None, end_ind=None, total_num_images=None, + gpu_id=0, +): + """Generate RPN proposals on all images in an imdb.""" + _t = Timer() + num_images = len(roidb) + roidb_boxes = [[] for _ in range(num_images)] + roidb_scores = [[] for _ in range(num_images)] + roidb_ids = [[] for _ in range(num_images)] + if start_ind is None: + start_ind = 0 + end_ind = num_images + total_num_images = num_images + for i in range(num_images): + roidb_ids[i] = roidb[i]['id'] + im = cv2.imread(roidb[i]['image']) + with c2_utils.NamedCudaScope(gpu_id): + _t.tic() + roidb_boxes[i], roidb_scores[i] = im_proposals(model, im) + _t.toc() + if i % 10 == 0: + ave_time = _t.average_time + eta_seconds = ave_time * (num_images - i - 1) + eta = str(datetime.timedelta(seconds=int(eta_seconds))) + logger.info( + ( + 'rpn_generate: range [{:d}, {:d}] of {:d}: ' + '{:d}/{:d} {:.3f}s (eta: {})' + 
def im_proposals(model, im):
    """Generate RPN proposals on a single image.

    Returns (boxes, scores): boxes is N x 4 in the original image coordinate
    frame; scores is the corresponding 1-D objectness array.
    """
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))
    workspace.RunNet(model.net.Proto().name)

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        # Fetch per-level FPN proposals; blob order is rois first, then scores,
        # so the slicing below by len(rois_names) must stay in sync.
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        rois_names = [
            core.ScopedName('rpn_rois_fpn' + str(l))
            for l in range(k_min, k_max + 1)
        ]
        score_names = [
            core.ScopedName('rpn_roi_probs_fpn' + str(l))
            for l in range(k_min, k_max + 1)
        ]
        blobs = workspace.FetchBlobs(rois_names + score_names)
        # Combine predictions across all levels and retain the top scoring
        boxes = np.concatenate(blobs[:len(rois_names)])
        scores = np.concatenate(blobs[len(rois_names):]).squeeze()
        # Discussion: one could do NMS again after combining predictions from
        # the different FPN levels. Conceptually, it's probably the right thing
        # to do. For arbitrary reasons, the original FPN RPN implementation did
        # not do another round of NMS.
        inds = np.argsort(-scores)[:cfg.TEST.RPN_POST_NMS_TOP_N]
        scores = scores[inds]
        boxes = boxes[inds, :]
    else:
        # Single-level RPN: fetch the combined proposals directly.
        boxes, scores = workspace.FetchBlobs(
            [core.ScopedName('rpn_rois'),
             core.ScopedName('rpn_roi_probs')]
        )
        scores = scores.squeeze()

    # Column 0 is the batch index in the (batch ind, x1, y1, x2, y2) encoding,
    # so we remove it since we just want to return boxes
    # Scale proposals back to the original input image scale
    boxes = boxes[:, 1:] / im_scale
    return boxes, scores
def get_roidb(dataset_name, ind_range):
    """Get the roidb for the dataset specified in the global cfg. Optionally
    restrict it to a range of indices if ind_range is a pair of integers.

    Returns (roidb, start, end, total_num_images); total_num_images always
    refers to the full (unsliced) dataset size.
    """
    roidb = JsonDataset(dataset_name).get_roidb()
    total_num_images = len(roidb)

    if ind_range is None:
        start, end = 0, total_num_images
    else:
        start, end = ind_range
        roidb = roidb[start:end]

    return roidb, start, end, total_num_images
def im_detect_all(model, im, box_proposals, timers=None):
    """Run full inference (boxes, and optionally masks/keypoints) on one image.

    Returns (cls_boxes, cls_segms, cls_keyps); the latter two are None when the
    corresponding heads are disabled or no boxes survive NMS.
    """
    if timers is None:
        timers = defaultdict(Timer)

    # Handle RetinaNet testing separately for now
    if cfg.RETINANET.RETINANET_ON:
        cls_boxes = test_retinanet.im_detect_bbox(model, im, timers)
        return cls_boxes, None, None

    timers['im_detect_bbox'].tic()
    if cfg.TEST.BBOX_AUG.ENABLED:
        scores, boxes, im_scale = im_detect_bbox_aug(model, im, box_proposals)
    else:
        scores, boxes, im_scale = im_detect_bbox(
            model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes=box_proposals
        )
    timers['im_detect_bbox'].toc()

    # score and boxes are from the whole image after score thresholding and nms
    # (they are not separated by class)
    # cls_boxes boxes and scores are separated by class and in the format used
    # for evaluating results
    timers['misc_bbox'].tic()
    scores, boxes, cls_boxes = box_results_with_nms_and_limit(scores, boxes)
    timers['misc_bbox'].toc()

    # Mask head runs only on post-NMS boxes; skipped entirely when empty.
    if cfg.MODEL.MASK_ON and boxes.shape[0] > 0:
        timers['im_detect_mask'].tic()
        if cfg.TEST.MASK_AUG.ENABLED:
            masks = im_detect_mask_aug(model, im, boxes)
        else:
            masks = im_detect_mask(model, im_scale, boxes)
        timers['im_detect_mask'].toc()

        timers['misc_mask'].tic()
        cls_segms = segm_results(
            cls_boxes, masks, boxes, im.shape[0], im.shape[1]
        )
        timers['misc_mask'].toc()
    else:
        cls_segms = None

    # Keypoint head likewise runs only on surviving boxes.
    if cfg.MODEL.KEYPOINTS_ON and boxes.shape[0] > 0:
        timers['im_detect_keypoints'].tic()
        if cfg.TEST.KPS_AUG.ENABLED:
            heatmaps = im_detect_keypoints_aug(model, im, boxes)
        else:
            heatmaps = im_detect_keypoints(model, im_scale, boxes)
        timers['im_detect_keypoints'].toc()

        timers['misc_keypoints'].tic()
        cls_keyps = keypoint_results(cls_boxes, heatmaps, boxes)
        timers['misc_keypoints'].toc()
    else:
        cls_keyps = None

    return cls_boxes, cls_segms, cls_keyps
def im_detect_bbox(model, im, target_scale, target_max_size, boxes=None):
    """Bounding box object detection for an image with given box proposals.

    Arguments:
        model (DetectionModelHelper): the detection model to use
        im (ndarray): color image to test (in BGR order)
        boxes (ndarray): R x 4 array of object proposals in 0-indexed
            [x1, y1, x2, y2] format, or None if using RPN

    Returns:
        scores (ndarray): R x K array of object class scores for K classes
            (K includes background as object category 0)
        boxes (ndarray): R x 4*K array of predicted bounding boxes
        im_scales (list): list of image scales used in the input blob (as
            returned by _get_blobs and for use with im_detect_mask, etc.)
    """
    inputs, im_scale = _get_blobs(im, boxes, target_scale, target_max_size)

    # When mapping from image ROIs to feature map ROIs, there's some aliasing
    # (some distinct image ROIs get mapped to the same feature ROI).
    # Here, we identify duplicate feature ROIs, so we only compute features
    # on the unique subset.
    if cfg.DEDUP_BOXES > 0 and not cfg.MODEL.FASTER_RCNN:
        # Hash each quantized ROI; collisions identify duplicates.
        v = np.array([1, 1e3, 1e6, 1e9, 1e12])
        hashes = np.round(inputs['rois'] * cfg.DEDUP_BOXES).dot(v)
        _, index, inv_index = np.unique(
            hashes, return_index=True, return_inverse=True
        )
        inputs['rois'] = inputs['rois'][index, :]
        boxes = boxes[index, :]

    # Add multi-level rois for FPN
    if cfg.FPN.MULTILEVEL_ROIS and not cfg.MODEL.FASTER_RCNN:
        _add_multilevel_rois_for_test(inputs, 'rois')

    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v)
    workspace.RunNet(model.net.Proto().name)

    # Read out blobs
    if cfg.MODEL.FASTER_RCNN:
        rois = workspace.FetchBlob(core.ScopedName('rois'))
        # unscale back to raw image space
        boxes = rois[:, 1:5] / im_scale

    # Softmax class probabilities
    scores = workspace.FetchBlob(core.ScopedName('cls_prob')).squeeze()
    # In case there is 1 proposal
    scores = scores.reshape([-1, scores.shape[-1]])

    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        box_deltas = workspace.FetchBlob(core.ScopedName('bbox_pred')).squeeze()
        # In case there is 1 proposal
        box_deltas = box_deltas.reshape([-1, box_deltas.shape[-1]])
        if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
            # Remove predictions for bg class (compat with MSRA code)
            box_deltas = box_deltas[:, -4:]
        pred_boxes = box_utils.bbox_transform(
            boxes, box_deltas, cfg.MODEL.BBOX_REG_WEIGHTS
        )
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG:
            # Broadcast the single class-agnostic box to all K classes.
            pred_boxes = np.tile(pred_boxes, (1, scores.shape[1]))
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))

    if cfg.DEDUP_BOXES > 0 and not cfg.MODEL.FASTER_RCNN:
        # Map scores and predictions back to the original set of boxes
        scores = scores[inv_index, :]
        pred_boxes = pred_boxes[inv_index, :]

    return scores, pred_boxes, im_scale
def im_detect_bbox_aug(model, im, box_proposals=None):
    """Performs bbox detection with test-time augmentations.
    Function signature is the same as for im_detect_bbox.
    """
    assert not cfg.TEST.BBOX_AUG.SCALE_SIZE_DEP, \
        'Size dependent scaling not implemented'
    assert not cfg.TEST.BBOX_AUG.SCORE_HEUR == 'UNION' or \
        cfg.TEST.BBOX_AUG.COORD_HEUR == 'UNION', \
        'Coord heuristic must be union whenever score heuristic is union'
    assert not cfg.TEST.BBOX_AUG.COORD_HEUR == 'UNION' or \
        cfg.TEST.BBOX_AUG.SCORE_HEUR == 'UNION', \
        'Score heuristic must be union whenever coord heuristic is union'
    assert not cfg.MODEL.FASTER_RCNN or \
        cfg.TEST.BBOX_AUG.SCORE_HEUR == 'UNION', \
        'Union heuristic must be used to combine Faster RCNN predictions'

    # Collect detections computed under different transformations
    scores_ts = []
    boxes_ts = []

    def add_preds_t(scores_t, boxes_t):
        scores_ts.append(scores_t)
        boxes_ts.append(boxes_t)

    # Perform detection on the horizontally flipped image
    if cfg.TEST.BBOX_AUG.H_FLIP:
        scores_hf, boxes_hf, _ = im_detect_bbox_hflip(
            model,
            im,
            cfg.TEST.SCALE,
            cfg.TEST.MAX_SIZE,
            box_proposals=box_proposals
        )
        add_preds_t(scores_hf, boxes_hf)

    # Compute detections at different scales
    for scale in cfg.TEST.BBOX_AUG.SCALES:
        max_size = cfg.TEST.BBOX_AUG.MAX_SIZE
        scores_scl, boxes_scl = im_detect_bbox_scale(
            model, im, scale, max_size, box_proposals
        )
        add_preds_t(scores_scl, boxes_scl)

        if cfg.TEST.BBOX_AUG.SCALE_H_FLIP:
            scores_scl_hf, boxes_scl_hf = im_detect_bbox_scale(
                model, im, scale, max_size, box_proposals, hflip=True
            )
            add_preds_t(scores_scl_hf, boxes_scl_hf)

    # Perform detection at different aspect ratios
    for aspect_ratio in cfg.TEST.BBOX_AUG.ASPECT_RATIOS:
        scores_ar, boxes_ar = im_detect_bbox_aspect_ratio(
            model, im, aspect_ratio, box_proposals
        )
        add_preds_t(scores_ar, boxes_ar)

        if cfg.TEST.BBOX_AUG.ASPECT_RATIO_H_FLIP:
            scores_ar_hf, boxes_ar_hf = im_detect_bbox_aspect_ratio(
                model, im, aspect_ratio, box_proposals, hflip=True
            )
            add_preds_t(scores_ar_hf, boxes_ar_hf)

    # Compute detections for the original image (identity transform) last to
    # ensure that the Caffe2 workspace is populated with blobs corresponding
    # to the original image on return (postcondition of im_detect_bbox)
    scores_i, boxes_i, im_scale_i = im_detect_bbox(
        model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes=box_proposals
    )
    add_preds_t(scores_i, boxes_i)

    # Combine the predicted scores
    if cfg.TEST.BBOX_AUG.SCORE_HEUR == 'ID':
        scores_c = scores_i
    elif cfg.TEST.BBOX_AUG.SCORE_HEUR == 'AVG':
        scores_c = np.mean(scores_ts, axis=0)
    elif cfg.TEST.BBOX_AUG.SCORE_HEUR == 'UNION':
        scores_c = np.vstack(scores_ts)
    else:
        raise NotImplementedError(
            'Score heur {} not supported'.format(cfg.TEST.BBOX_AUG.SCORE_HEUR)
        )

    # Combine the predicted boxes
    if cfg.TEST.BBOX_AUG.COORD_HEUR == 'ID':
        boxes_c = boxes_i
    elif cfg.TEST.BBOX_AUG.COORD_HEUR == 'AVG':
        boxes_c = np.mean(boxes_ts, axis=0)
    elif cfg.TEST.BBOX_AUG.COORD_HEUR == 'UNION':
        boxes_c = np.vstack(boxes_ts)
    else:
        raise NotImplementedError(
            'Coord heur {} not supported'.format(cfg.TEST.BBOX_AUG.COORD_HEUR)
        )

    return scores_c, boxes_c, im_scale_i
def im_detect_bbox_hflip(
    model, im, target_scale, target_max_size, box_proposals=None
):
    """Performs bbox detection on the horizontally flipped image.
    Function signature is the same as for im_detect_bbox.
    """
    im_width = im.shape[1]
    flipped = im[:, ::-1, :]

    # Faster R-CNN models generate proposals internally; only precomputed
    # proposals need to be mirrored to match the flipped image.
    proposals_hf = None
    if not cfg.MODEL.FASTER_RCNN:
        proposals_hf = box_utils.flip_boxes(box_proposals, im_width)

    scores_hf, boxes_hf, im_scale = im_detect_bbox(
        model, flipped, target_scale, target_max_size, boxes=proposals_hf
    )

    # Map the flipped-frame detections back into the original frame.
    return scores_hf, box_utils.flip_boxes(boxes_hf, im_width), im_scale
+ """ + # Compute predictions on the transformed image + im_ar = image_utils.aspect_ratio_rel(im, aspect_ratio) + + if not cfg.MODEL.FASTER_RCNN: + box_proposals_ar = box_utils.aspect_ratio(box_proposals, aspect_ratio) + else: + box_proposals_ar = None + + if hflip: + scores_ar, boxes_ar, _ = im_detect_bbox_hflip( + model, + im_ar, + cfg.TEST.SCALE, + cfg.TEST.MAX_SIZE, + box_proposals=box_proposals_ar + ) + else: + scores_ar, boxes_ar, _ = im_detect_bbox( + model, + im_ar, + cfg.TEST.SCALE, + cfg.TEST.MAX_SIZE, + boxes=box_proposals_ar + ) + + # Invert the detected boxes + boxes_inv = box_utils.aspect_ratio(boxes_ar, 1.0 / aspect_ratio) + + return scores_ar, boxes_inv + + +def im_detect_mask(model, im_scale, boxes): + """Infer instance segmentation masks. This function must be called after + im_detect_bbox as it assumes that the Caffe2 workspace is already populated + with the necessary blobs. + + Arguments: + model (DetectionModelHelper): the detection model to use + im_scales (list): image blob scales as returned by im_detect_bbox + boxes (ndarray): R x 4 array of bounding box detections (e.g., as + returned by im_detect_bbox) + + Returns: + pred_masks (ndarray): R x K x M x M array of class specific soft masks + output by the network (must be processed by segm_results to convert + into hard masks in the original image coordinate space) + """ + M = cfg.MRCNN.RESOLUTION + if boxes.shape[0] == 0: + pred_masks = np.zeros((0, M, M), np.float32) + return pred_masks + + inputs = {'mask_rois': _get_rois_blob(boxes, im_scale)} + # Add multi-level rois for FPN + if cfg.FPN.MULTILEVEL_ROIS: + _add_multilevel_rois_for_test(inputs, 'mask_rois') + + for k, v in inputs.items(): + workspace.FeedBlob(core.ScopedName(k), v) + workspace.RunNet(model.mask_net.Proto().name) + + # Fetch masks + pred_masks = workspace.FetchBlob( + core.ScopedName('mask_fcn_probs') + ).squeeze() + + if cfg.MRCNN.CLS_SPECIFIC_MASK: + pred_masks = pred_masks.reshape([-1, cfg.MODEL.NUM_CLASSES, M, M]) + 
def im_detect_mask_aug(model, im, boxes):
    """Performs mask detection with test-time augmentations.

    Arguments:
        model (DetectionModelHelper): the detection model to use
        im (ndarray): BGR image to test
        boxes (ndarray): R x 4 array of bounding boxes

    Returns:
        masks (ndarray): R x K x M x M array of class specific soft masks
    """
    assert not cfg.TEST.MASK_AUG.SCALE_SIZE_DEP, \
        'Size dependent scaling not implemented'

    # Collect masks computed under different transformations
    masks_ts = []

    # Compute masks for the original image (identity transform)
    im_scale_i = im_conv_body_only(model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    masks_i = im_detect_mask(model, im_scale_i, boxes)
    masks_ts.append(masks_i)

    # Perform mask detection on the horizontally flipped image
    if cfg.TEST.MASK_AUG.H_FLIP:
        masks_hf = im_detect_mask_hflip(
            model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes
        )
        masks_ts.append(masks_hf)

    # Compute detections at different scales
    for scale in cfg.TEST.MASK_AUG.SCALES:
        max_size = cfg.TEST.MASK_AUG.MAX_SIZE
        masks_scl = im_detect_mask_scale(model, im, scale, max_size, boxes)
        masks_ts.append(masks_scl)

        if cfg.TEST.MASK_AUG.SCALE_H_FLIP:
            masks_scl_hf = im_detect_mask_scale(
                model, im, scale, max_size, boxes, hflip=True
            )
            masks_ts.append(masks_scl_hf)

    # Compute masks at different aspect ratios
    for aspect_ratio in cfg.TEST.MASK_AUG.ASPECT_RATIOS:
        masks_ar = im_detect_mask_aspect_ratio(model, im, aspect_ratio, boxes)
        masks_ts.append(masks_ar)

        if cfg.TEST.MASK_AUG.ASPECT_RATIO_H_FLIP:
            masks_ar_hf = im_detect_mask_aspect_ratio(
                model, im, aspect_ratio, boxes, hflip=True
            )
            masks_ts.append(masks_ar_hf)

    # Combine the predicted soft masks
    if cfg.TEST.MASK_AUG.HEUR == 'SOFT_AVG':
        masks_c = np.mean(masks_ts, axis=0)
    elif cfg.TEST.MASK_AUG.HEUR == 'SOFT_MAX':
        masks_c = np.amax(masks_ts, axis=0)
    elif cfg.TEST.MASK_AUG.HEUR == 'LOGIT_AVG':
        # Average in logit space, then map back through a sigmoid; the
        # 1e-20 floor guards against log(0) / division by zero.
        def logit(y):
            return -1.0 * np.log((1.0 - y) / np.maximum(y, 1e-20))

        logit_masks = [logit(y) for y in masks_ts]
        logit_masks = np.mean(logit_masks, axis=0)
        masks_c = 1.0 / (1.0 + np.exp(-logit_masks))
    else:
        raise NotImplementedError(
            'Heuristic {} not supported'.format(cfg.TEST.MASK_AUG.HEUR)
        )

    return masks_c
def im_detect_mask_hflip(model, im, target_scale, target_max_size, boxes):
    """Performs mask detection on the horizontally flipped image.
    Function signature is the same as for im_detect_mask_aug.
    """
    width = im.shape[1]
    flipped_im = im[:, ::-1, :]
    flipped_boxes = box_utils.flip_boxes(boxes, width)

    # Populate the workspace with conv features for the mirrored image, then
    # run the mask head on the mirrored boxes.
    scale = im_conv_body_only(model, flipped_im, target_scale, target_max_size)
    flipped_masks = im_detect_mask(model, scale, flipped_boxes)

    # Undo the flip along the mask x-axis (last dimension).
    return flipped_masks[:, :, :, ::-1]
def im_detect_keypoints(model, im_scale, boxes):
    """Infer instance keypoint poses. This function must be called after
    im_detect_bbox as it assumes that the Caffe2 workspace is already populated
    with the necessary blobs.

    Arguments:
        model (DetectionModelHelper): the detection model to use
        im_scales (list): image blob scales as returned by im_detect_bbox
        boxes (ndarray): R x 4 array of bounding box detections (e.g., as
            returned by im_detect_bbox)

    Returns:
        pred_heatmaps (ndarray): R x J x M x M array of keypoint location
            logits (softmax inputs) for each of the J keypoint types output
            by the network (must be processed by keypoint_results to convert
            into point predictions in the original image coordinate space)
    """
    M = cfg.KRCNN.HEATMAP_SIZE
    if boxes.shape[0] == 0:
        # No boxes: return an empty, correctly-shaped heatmap array.
        pred_heatmaps = np.zeros((0, cfg.KRCNN.NUM_KEYPOINTS, M, M), np.float32)
        return pred_heatmaps

    inputs = {'keypoint_rois': _get_rois_blob(boxes, im_scale)}

    # Add multi-level rois for FPN
    if cfg.FPN.MULTILEVEL_ROIS:
        _add_multilevel_rois_for_test(inputs, 'keypoint_rois')

    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v)
    workspace.RunNet(model.keypoint_net.Proto().name)

    pred_heatmaps = workspace.FetchBlob(core.ScopedName('kps_score')).squeeze()

    # In case of 1 (squeeze() dropped the instance dim; restore it)
    if pred_heatmaps.ndim == 3:
        pred_heatmaps = np.expand_dims(pred_heatmaps, axis=0)

    return pred_heatmaps
def im_detect_keypoints_aug(model, im, boxes):
    """Computes keypoint predictions with test-time augmentations.

    Arguments:
        model (DetectionModelHelper): the detection model to use
        im (ndarray): BGR image to test
        boxes (ndarray): R x 4 array of bounding boxes

    Returns:
        heatmaps (ndarray): R x J x M x M array of keypoint location logits
    """

    # Collect heatmaps predicted under different transformations
    heatmaps_ts = []
    # Tag predictions computed under downscaling and upscaling transformations
    ds_ts = []
    us_ts = []

    def add_heatmaps_t(heatmaps_t, ds_t=False, us_t=False):
        heatmaps_ts.append(heatmaps_t)
        ds_ts.append(ds_t)
        us_ts.append(us_t)

    # Compute the heatmaps for the original image (identity transform)
    im_scale = im_conv_body_only(model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)
    heatmaps_i = im_detect_keypoints(model, im_scale, boxes)
    add_heatmaps_t(heatmaps_i)

    # Perform keypoints detection on the horizontally flipped image
    if cfg.TEST.KPS_AUG.H_FLIP:
        heatmaps_hf = im_detect_keypoints_hflip(
            model, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, boxes
        )
        add_heatmaps_t(heatmaps_hf)

    # Compute detections at different scales
    for scale in cfg.TEST.KPS_AUG.SCALES:
        # Mark whether this pass down- or up-scales relative to the base scale
        ds_scl = scale < cfg.TEST.SCALE
        us_scl = scale > cfg.TEST.SCALE
        heatmaps_scl = im_detect_keypoints_scale(
            model, im, scale, cfg.TEST.KPS_AUG.MAX_SIZE, boxes
        )
        add_heatmaps_t(heatmaps_scl, ds_scl, us_scl)

        if cfg.TEST.KPS_AUG.SCALE_H_FLIP:
            heatmaps_scl_hf = im_detect_keypoints_scale(
                model, im, scale, cfg.TEST.KPS_AUG.MAX_SIZE, boxes, hflip=True
            )
            add_heatmaps_t(heatmaps_scl_hf, ds_scl, us_scl)

    # Compute keypoints at different aspect ratios
    for aspect_ratio in cfg.TEST.KPS_AUG.ASPECT_RATIOS:
        heatmaps_ar = im_detect_keypoints_aspect_ratio(
            model, im, aspect_ratio, boxes
        )
        add_heatmaps_t(heatmaps_ar)

        if cfg.TEST.KPS_AUG.ASPECT_RATIO_H_FLIP:
            heatmaps_ar_hf = im_detect_keypoints_aspect_ratio(
                model, im, aspect_ratio, boxes, hflip=True
            )
            add_heatmaps_t(heatmaps_ar_hf)

    # Select the heuristic function for combining the heatmaps
    if cfg.TEST.KPS_AUG.HEUR == 'HM_AVG':
        np_f = np.mean
    elif cfg.TEST.KPS_AUG.HEUR == 'HM_MAX':
        np_f = np.amax
    else:
        raise NotImplementedError(
            'Heuristic {} not supported'.format(cfg.TEST.KPS_AUG.HEUR)
        )

    def heur_f(hms_ts):
        return np_f(hms_ts, axis=0)

    # Combine the heatmaps
    if cfg.TEST.KPS_AUG.SCALE_SIZE_DEP:
        heatmaps_c = combine_heatmaps_size_dep(
            heatmaps_ts, ds_ts, us_ts, boxes, heur_f
        )
    else:
        heatmaps_c = heur_f(heatmaps_ts)

    return heatmaps_c
def im_detect_keypoints_hflip(model, im, target_scale, target_max_size, boxes):
    """Computes keypoint predictions on the horizontally flipped image.
    Function signature is the same as for im_detect_keypoints_aug.
    """
    width = im.shape[1]
    mirrored_im = im[:, ::-1, :]
    mirrored_boxes = box_utils.flip_boxes(boxes, width)

    # Run the conv body and keypoint head on the mirrored inputs.
    scale = im_conv_body_only(model, mirrored_im, target_scale, target_max_size)
    mirrored_heatmaps = im_detect_keypoints(model, scale, mirrored_boxes)

    # Map the heatmaps back to the original (unflipped) orientation.
    return keypoint_utils.flip_heatmaps(mirrored_heatmaps)
def combine_heatmaps_size_dep(hms_ts, ds_ts, us_ts, boxes, heur_f):
    """Combines heatmaps while taking object sizes into account."""
    assert len(hms_ts) == len(ds_ts) and len(ds_ts) == len(us_ts), \
        'All sets of hms must be tagged with downscaling and upscaling flags'

    # Split objects into small+medium vs. large by box area threshold.
    areas = box_utils.boxes_area(boxes)
    small_medium = areas < cfg.TEST.KPS_AUG.AREA_TH
    large = areas >= cfg.TEST.KPS_AUG.AREA_TH

    combined = np.zeros_like(hms_ts[0])
    for idx in range(combined.shape[0]):
        # Keep only the transformations appropriate for this object's size:
        # drop downscaled passes for small/medium objects and upscaled passes
        # for large objects, then merge the rest with the heuristic.
        selected = [
            hms_t[idx]
            for hms_t, ds_t, us_t in zip(hms_ts, ds_ts, us_ts)
            if not (small_medium[idx] and ds_t) and not (large[idx] and us_t)
        ]
        combined[idx] = heur_f(selected)

    return combined
+ """ + num_classes = cfg.MODEL.NUM_CLASSES + cls_boxes = [[] for _ in range(num_classes)] + # Apply threshold on detection probabilities and apply NMS + # Skip j = 0, because it's the background class + for j in range(1, num_classes): + inds = np.where(scores[:, j] > cfg.TEST.SCORE_THRESH)[0] + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4:(j + 1) * 4] + dets_j = np.hstack((boxes_j, scores_j[:, np.newaxis])).astype( + np.float32, copy=False + ) + if cfg.TEST.SOFT_NMS.ENABLED: + nms_dets, _ = box_utils.soft_nms( + dets_j, + sigma=cfg.TEST.SOFT_NMS.SIGMA, + overlap_thresh=cfg.TEST.NMS, + score_thresh=0.0001, + method=cfg.TEST.SOFT_NMS.METHOD + ) + else: + keep = box_utils.nms(dets_j, cfg.TEST.NMS) + nms_dets = dets_j[keep, :] + # Refine the post-NMS boxes using bounding-box voting + if cfg.TEST.BBOX_VOTE.ENABLED: + nms_dets = box_utils.box_voting( + nms_dets, + dets_j, + cfg.TEST.BBOX_VOTE.VOTE_TH, + scoring_method=cfg.TEST.BBOX_VOTE.SCORING_METHOD + ) + cls_boxes[j] = nms_dets + + # Limit to max_per_image detections **over all classes** + if cfg.TEST.DETECTIONS_PER_IM > 0: + image_scores = np.hstack( + [cls_boxes[j][:, -1] for j in range(1, num_classes)] + ) + if len(image_scores) > cfg.TEST.DETECTIONS_PER_IM: + image_thresh = np.sort(image_scores)[-cfg.TEST.DETECTIONS_PER_IM] + for j in range(1, num_classes): + keep = np.where(cls_boxes[j][:, -1] >= image_thresh)[0] + cls_boxes[j] = cls_boxes[j][keep, :] + + im_results = np.vstack([cls_boxes[j] for j in range(1, num_classes)]) + boxes = im_results[:, :-1] + scores = im_results[:, -1] + return scores, boxes, cls_boxes + + +def segm_results(cls_boxes, masks, ref_boxes, im_h, im_w): + num_classes = cfg.MODEL.NUM_CLASSES + cls_segms = [[] for _ in range(num_classes)] + mask_ind = 0 + # To work around an issue with cv2.resize (it seems to automatically pad + # with repeated border values), we manually zero-pad the masks by 1 pixel + # prior to resizing back to the original image resolution. 
This prevents + # "top hat" artifacts. We therefore need to expand the reference boxes by an + # appropriate factor. + M = cfg.MRCNN.RESOLUTION + scale = (M + 2.0) / M + ref_boxes = box_utils.expand_boxes(ref_boxes, scale) + ref_boxes = ref_boxes.astype(np.int32) + padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32) + + # skip j = 0, because it's the background class + for j in range(1, num_classes): + segms = [] + for _ in range(cls_boxes[j].shape[0]): + if cfg.MRCNN.CLS_SPECIFIC_MASK: + padded_mask[1:-1, 1:-1] = masks[mask_ind, j, :, :] + else: + padded_mask[1:-1, 1:-1] = masks[mask_ind, 0, :, :] + + ref_box = ref_boxes[mask_ind, :] + w = ref_box[2] - ref_box[0] + 1 + h = ref_box[3] - ref_box[1] + 1 + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + mask = cv2.resize(padded_mask, (w, h)) + mask = np.array(mask > cfg.MRCNN.THRESH_BINARIZE, dtype=np.uint8) + im_mask = np.zeros((im_h, im_w), dtype=np.uint8) + + x_0 = max(ref_box[0], 0) + x_1 = min(ref_box[2] + 1, im_w) + y_0 = max(ref_box[1], 0) + y_1 = min(ref_box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - ref_box[1]):(y_1 - ref_box[1]), + (x_0 - ref_box[0]):(x_1 - ref_box[0]) + ] + + # Get RLE encoding used by the COCO evaluation API + rle = mask_util.encode( + np.array(im_mask[:, :, np.newaxis], order='F') + )[0] + segms.append(rle) + + mask_ind += 1 + + cls_segms[j] = segms + + assert mask_ind == masks.shape[0] + return cls_segms + + +def keypoint_results(cls_boxes, pred_heatmaps, ref_boxes): + num_classes = cfg.MODEL.NUM_CLASSES + cls_keyps = [[] for _ in range(num_classes)] + person_idx = keypoint_utils.get_person_class_index() + xy_preds = keypoint_utils.heatmaps_to_keypoints(pred_heatmaps, ref_boxes) + + # NMS OKS + if cfg.KRCNN.NMS_OKS: + keep = keypoint_utils.nms_oks(xy_preds, ref_boxes, 0.3) + xy_preds = xy_preds[keep, :, :] + ref_boxes = ref_boxes[keep, :] + pred_heatmaps = pred_heatmaps[keep, :, :, :] + cls_boxes[person_idx] = cls_boxes[person_idx][keep, :] + + kps = [xy_preds[i] 
for i in range(xy_preds.shape[0])] + cls_keyps[person_idx] = kps + return cls_keyps + + +def _get_rois_blob(im_rois, im_scale): + """Converts RoIs into network inputs. + + Arguments: + im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates + im_scale_factors (list): scale factors as returned by _get_image_blob + + Returns: + blob (ndarray): R x 5 matrix of RoIs in the image pyramid with columns + [level, x1, y1, x2, y2] + """ + rois, levels = _project_im_rois(im_rois, im_scale) + rois_blob = np.hstack((levels, rois)) + return rois_blob.astype(np.float32, copy=False) + + +def _project_im_rois(im_rois, scales): + """Project image RoIs into the image pyramid built by _get_image_blob. + + Arguments: + im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates + scales (list): scale factors as returned by _get_image_blob + + Returns: + rois (ndarray): R x 4 matrix of projected RoI coordinates + levels (ndarray): image pyramid levels used by each projected RoI + """ + rois = im_rois.astype(np.float, copy=False) * scales + levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) + return rois, levels + + +def _add_multilevel_rois_for_test(blobs, name): + """Distributes a set of RoIs across FPN pyramid levels by creating new level + specific RoI blobs. 
+ + Arguments: + blobs (dict): dictionary of blobs + name (str): a key in 'blobs' identifying the source RoI blob + + Returns: + [by ref] blobs (dict): new keys named by `name + 'fpn' + level` + are added to dict each with a value that's an R_level x 5 ndarray of + RoIs (see _get_rois_blob for format) + """ + lvl_min = cfg.FPN.ROI_MIN_LEVEL + lvl_max = cfg.FPN.ROI_MAX_LEVEL + lvls = fpn.map_rois_to_fpn_levels(blobs[name][:, 1:5], lvl_min, lvl_max) + fpn.add_multilevel_roi_blobs( + blobs, name, blobs[name], lvls, lvl_min, lvl_max + ) + + +def _get_blobs(im, rois, target_scale, target_max_size): + """Convert an image and RoIs within that image into network inputs.""" + blobs = {} + blobs['data'], im_scale, blobs['im_info'] = \ + blob_utils.get_image_blob(im, target_scale, target_max_size) + if rois is not None: + blobs['rois'] = _get_rois_blob(rois, im_scale) + return blobs, im_scale diff --git a/detectron/core/test_engine.py b/detectron/core/test_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..f69bc20dd2d97a46db8ef5fb7617482b054c330b --- /dev/null +++ b/detectron/core/test_engine.py @@ -0,0 +1,395 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Test a Detectron network on an imdb (image database).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import defaultdict +import cv2 +import datetime +import logging +import numpy as np +import os + +from caffe2.python import workspace + +from detectron.core.config import cfg +from detectron.core.config import get_output_dir +from detectron.core.rpn_generator import generate_rpn_on_dataset +from detectron.core.rpn_generator import generate_rpn_on_range +from detectron.core.test import im_detect_all +from detectron.datasets import task_evaluation +from detectron.datasets.json_dataset import JsonDataset +from detectron.modeling import model_builder +from detectron.utils.io import save_object +from detectron.utils.timer import Timer +import detectron.utils.c2 as c2_utils +import detectron.utils.env as envu +import detectron.utils.net as net_utils +import detectron.utils.subprocess as subprocess_utils +import detectron.utils.vis as vis_utils + +logger = logging.getLogger(__name__) + + +def get_eval_functions(): + # Determine which parent or child function should handle inference + if cfg.MODEL.RPN_ONLY: + child_func = generate_rpn_on_range + parent_func = generate_rpn_on_dataset + else: + # Generic case that handles all network types other than RPN-only nets + # and RetinaNet + child_func = test_net + parent_func = test_net_on_dataset + + return parent_func, child_func + + +def get_inference_dataset(index, is_parent=True): + assert is_parent or len(cfg.TEST.DATASETS) == 1, \ + 'The child inference process can only work on a single dataset' + + dataset_name = cfg.TEST.DATASETS[index] + + if cfg.TEST.PRECOMPUTED_PROPOSALS: + assert is_parent or len(cfg.TEST.PROPOSAL_FILES) == 1, \ + 'The child inference process can only work on a single proposal file' + 
assert len(cfg.TEST.PROPOSAL_FILES) == len(cfg.TEST.DATASETS), \ + 'If proposals are used, one proposal file must be specified for ' \ + 'each dataset' + proposal_file = cfg.TEST.PROPOSAL_FILES[index] + else: + proposal_file = None + + return dataset_name, proposal_file + + +def run_inference( + weights_file, ind_range=None, + multi_gpu_testing=False, gpu_id=0, + check_expected_results=False, +): + parent_func, child_func = get_eval_functions() + is_parent = ind_range is None + + def result_getter(): + if is_parent: + # Parent case: + # In this case we're either running inference on the entire dataset in a + # single process or (if multi_gpu_testing is True) using this process to + # launch subprocesses that each run inference on a range of the dataset + all_results = {} + for i in range(len(cfg.TEST.DATASETS)): + dataset_name, proposal_file = get_inference_dataset(i) + output_dir = get_output_dir(dataset_name, training=False) + results = parent_func( + weights_file, + dataset_name, + proposal_file, + output_dir, + multi_gpu=multi_gpu_testing + ) + all_results.update(results) + + return all_results + else: + # Subprocess child case: + # In this case test_net was called via subprocess.Popen to execute on a + # range of inputs on a single dataset + dataset_name, proposal_file = get_inference_dataset(0, is_parent=False) + output_dir = get_output_dir(dataset_name, training=False) + return child_func( + weights_file, + dataset_name, + proposal_file, + output_dir, + ind_range=ind_range, + gpu_id=gpu_id + ) + + all_results = result_getter() + if check_expected_results and is_parent: + task_evaluation.check_expected_results( + all_results, + atol=cfg.EXPECTED_RESULTS_ATOL, + rtol=cfg.EXPECTED_RESULTS_RTOL + ) + task_evaluation.log_copy_paste_friendly_results(all_results) + + return all_results + + +def test_net_on_dataset( + weights_file, + dataset_name, + proposal_file, + output_dir, + multi_gpu=False, + gpu_id=0 +): + """Run inference on a dataset.""" + dataset = 
JsonDataset(dataset_name) + test_timer = Timer() + test_timer.tic() + if multi_gpu: + num_images = len(dataset.get_roidb()) + all_boxes, all_segms, all_keyps = multi_gpu_test_net_on_dataset( + weights_file, dataset_name, proposal_file, num_images, output_dir + ) + else: + all_boxes, all_segms, all_keyps = test_net( + weights_file, dataset_name, proposal_file, output_dir, gpu_id=gpu_id + ) + test_timer.toc() + logger.info('Total inference time: {:.3f}s'.format(test_timer.average_time)) + results = task_evaluation.evaluate_all( + dataset, all_boxes, all_segms, all_keyps, output_dir + ) + return results + + +def multi_gpu_test_net_on_dataset( + weights_file, dataset_name, proposal_file, num_images, output_dir +): + """Multi-gpu inference on a dataset.""" + binary_dir = envu.get_runtime_dir() + binary_ext = envu.get_py_bin_ext() + binary = os.path.join(binary_dir, 'test_net' + binary_ext) + assert os.path.exists(binary), 'Binary \'{}\' not found'.format(binary) + + # Pass the target dataset and proposal file (if any) via the command line + opts = ['TEST.DATASETS', '("{}",)'.format(dataset_name)] + opts += ['TEST.WEIGHTS', weights_file] + if proposal_file: + opts += ['TEST.PROPOSAL_FILES', '("{}",)'.format(proposal_file)] + + # Run inference in parallel in subprocesses + # Outputs will be a list of outputs from each subprocess, where the output + # of each subprocess is the dictionary saved by test_net(). 
+ outputs = subprocess_utils.process_in_parallel( + 'detection', num_images, binary, output_dir, opts + ) + + # Collate the results from each subprocess + all_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)] + all_segms = [[] for _ in range(cfg.MODEL.NUM_CLASSES)] + all_keyps = [[] for _ in range(cfg.MODEL.NUM_CLASSES)] + for det_data in outputs: + all_boxes_batch = det_data['all_boxes'] + all_segms_batch = det_data['all_segms'] + all_keyps_batch = det_data['all_keyps'] + for cls_idx in range(1, cfg.MODEL.NUM_CLASSES): + all_boxes[cls_idx] += all_boxes_batch[cls_idx] + all_segms[cls_idx] += all_segms_batch[cls_idx] + all_keyps[cls_idx] += all_keyps_batch[cls_idx] + det_file = os.path.join(output_dir, 'detections.pkl') + cfg_yaml = envu.yaml_dump(cfg) + save_object( + dict( + all_boxes=all_boxes, + all_segms=all_segms, + all_keyps=all_keyps, + cfg=cfg_yaml + ), det_file + ) + logger.info('Wrote detections to: {}'.format(os.path.abspath(det_file))) + + return all_boxes, all_segms, all_keyps + + +def test_net( + weights_file, + dataset_name, + proposal_file, + output_dir, + ind_range=None, + gpu_id=0 +): + """Run inference on all images in a dataset or over an index range of images + in a dataset using a single GPU. + """ + assert not cfg.MODEL.RPN_ONLY, \ + 'Use rpn_generate to generate proposals from RPN-only models' + + roidb, dataset, start_ind, end_ind, total_num_images = get_roidb_and_dataset( + dataset_name, proposal_file, ind_range + ) + model = initialize_model_from_cfg(weights_file, gpu_id=gpu_id) + num_images = len(roidb) + num_classes = cfg.MODEL.NUM_CLASSES + all_boxes, all_segms, all_keyps = empty_results(num_classes, num_images) + timers = defaultdict(Timer) + for i, entry in enumerate(roidb): + if cfg.TEST.PRECOMPUTED_PROPOSALS: + # The roidb may contain ground-truth rois (for example, if the roidb + # comes from the training or val split). We only want to evaluate + # detection on the *non*-ground-truth rois. 
We select only the rois + # that have the gt_classes field set to 0, which means there's no + # ground truth. + box_proposals = entry['boxes'][entry['gt_classes'] == 0] + if len(box_proposals) == 0: + continue + else: + # Faster R-CNN type models generate proposals on-the-fly with an + # in-network RPN; 1-stage models don't require proposals. + box_proposals = None + + im = cv2.imread(entry['image']) + with c2_utils.NamedCudaScope(gpu_id): + cls_boxes_i, cls_segms_i, cls_keyps_i = im_detect_all( + model, im, box_proposals, timers + ) + + extend_results(i, all_boxes, cls_boxes_i) + if cls_segms_i is not None: + extend_results(i, all_segms, cls_segms_i) + if cls_keyps_i is not None: + extend_results(i, all_keyps, cls_keyps_i) + + if i % 10 == 0: # Reduce log file size + ave_total_time = np.sum([t.average_time for t in timers.values()]) + eta_seconds = ave_total_time * (num_images - i - 1) + eta = str(datetime.timedelta(seconds=int(eta_seconds))) + det_time = ( + timers['im_detect_bbox'].average_time + + timers['im_detect_mask'].average_time + + timers['im_detect_keypoints'].average_time + ) + misc_time = ( + timers['misc_bbox'].average_time + + timers['misc_mask'].average_time + + timers['misc_keypoints'].average_time + ) + logger.info( + ( + 'im_detect: range [{:d}, {:d}] of {:d}: ' + '{:d}/{:d} {:.3f}s + {:.3f}s (eta: {})' + ).format( + start_ind + 1, end_ind, total_num_images, start_ind + i + 1, + start_ind + num_images, det_time, misc_time, eta + ) + ) + + if cfg.VIS: + im_name = os.path.splitext(os.path.basename(entry['image']))[0] + vis_utils.vis_one_image( + im[:, :, ::-1], + '{:d}_{:s}'.format(i, im_name), + os.path.join(output_dir, 'vis'), + cls_boxes_i, + segms=cls_segms_i, + keypoints=cls_keyps_i, + thresh=cfg.VIS_TH, + box_alpha=0.8, + dataset=dataset, + show_class=True + ) + + cfg_yaml = envu.yaml_dump(cfg) + if ind_range is not None: + det_name = 'detection_range_%s_%s.pkl' % tuple(ind_range) + else: + det_name = 'detections.pkl' + det_file = 
os.path.join(output_dir, det_name) + save_object( + dict( + all_boxes=all_boxes, + all_segms=all_segms, + all_keyps=all_keyps, + cfg=cfg_yaml + ), det_file + ) + logger.info('Wrote detections to: {}'.format(os.path.abspath(det_file))) + return all_boxes, all_segms, all_keyps + + +def initialize_model_from_cfg(weights_file, gpu_id=0): + """Initialize a model from the global cfg. Loads test-time weights and + creates the networks in the Caffe2 workspace. + """ + model = model_builder.create(cfg.MODEL.TYPE, train=False, gpu_id=gpu_id) + net_utils.initialize_gpu_from_weights_file( + model, weights_file, gpu_id=gpu_id, + ) + model_builder.add_inference_inputs(model) + workspace.CreateNet(model.net) + workspace.CreateNet(model.conv_body_net) + if cfg.MODEL.MASK_ON: + workspace.CreateNet(model.mask_net) + if cfg.MODEL.KEYPOINTS_ON: + workspace.CreateNet(model.keypoint_net) + return model + + +def get_roidb_and_dataset(dataset_name, proposal_file, ind_range): + """Get the roidb for the dataset specified in the global cfg. Optionally + restrict it to a range of indices if ind_range is a pair of integers. + """ + dataset = JsonDataset(dataset_name) + if cfg.TEST.PRECOMPUTED_PROPOSALS: + assert proposal_file, 'No proposal file given' + roidb = dataset.get_roidb( + proposal_file=proposal_file, + proposal_limit=cfg.TEST.PROPOSAL_LIMIT + ) + else: + roidb = dataset.get_roidb() + + if ind_range is not None: + total_num_images = len(roidb) + start, end = ind_range + roidb = roidb[start:end] + else: + start = 0 + end = len(roidb) + total_num_images = end + + return roidb, dataset, start, end, total_num_images + + +def empty_results(num_classes, num_images): + """Return empty results lists for boxes, masks, and keypoints. + Box detections are collected into: + all_boxes[cls][image] = N x 5 array with columns (x1, y1, x2, y2, score) + Instance mask predictions are collected into: + all_segms[cls][image] = [...] 
list of COCO RLE encoded masks that are in + 1:1 correspondence with the boxes in all_boxes[cls][image] + Keypoint predictions are collected into: + all_keyps[cls][image] = [...] list of keypoints results, each encoded as + a 3D array (#rois, 4, #keypoints) with the 4 rows corresponding to + [x, y, logit, prob] (See: utils.keypoints.heatmaps_to_keypoints). + Keypoints are recorded for person (cls = 1); they are in 1:1 + correspondence with the boxes in all_boxes[cls][image]. + """ + # Note: do not be tempted to use [[] * N], which gives N references to the + # *same* empty list. + all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] + all_segms = [[[] for _ in range(num_images)] for _ in range(num_classes)] + all_keyps = [[[] for _ in range(num_images)] for _ in range(num_classes)] + return all_boxes, all_segms, all_keyps + + +def extend_results(index, all_res, im_res): + """Add results for an image to the set of all results at the specified + index. + """ + # Skip cls_idx 0 (__background__) + for cls_idx in range(1, len(im_res)): + all_res[cls_idx][index] = im_res[cls_idx] diff --git a/detectron/core/test_retinanet.py b/detectron/core/test_retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..2c4e4f3e87e199338999180051ecd03295379ae4 --- /dev/null +++ b/detectron/core/test_retinanet.py @@ -0,0 +1,200 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Test a RetinaNet network on an image database""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import logging +from collections import defaultdict + +from caffe2.python import core, workspace + +from detectron.core.config import cfg +from detectron.modeling.generate_anchors import generate_anchors +from detectron.utils.timer import Timer +import detectron.utils.blob as blob_utils +import detectron.utils.boxes as box_utils + +logger = logging.getLogger(__name__) + + +def _create_cell_anchors(): + """ + Generate all types of anchors for all fpn levels/scales/aspect ratios. + This function is called only once at the beginning of inference. + """ + k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL + scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE + aspect_ratios = cfg.RETINANET.ASPECT_RATIOS + anchor_scale = cfg.RETINANET.ANCHOR_SCALE + A = scales_per_octave * len(aspect_ratios) + anchors = {} + for lvl in range(k_min, k_max + 1): + # create cell anchors array + stride = 2. 
** lvl + cell_anchors = np.zeros((A, 4)) + a = 0 + for octave in range(scales_per_octave): + octave_scale = 2 ** (octave / float(scales_per_octave)) + for aspect in aspect_ratios: + anchor_sizes = (stride * octave_scale * anchor_scale, ) + anchor_aspect_ratios = (aspect, ) + cell_anchors[a, :] = generate_anchors( + stride=stride, sizes=anchor_sizes, + aspect_ratios=anchor_aspect_ratios) + a += 1 + anchors[lvl] = cell_anchors + return anchors + + +def im_detect_bbox(model, im, timers=None): + """Generate RetinaNet detections on a single image.""" + if timers is None: + timers = defaultdict(Timer) + # Although anchors are input independent and could be precomputed, + # recomputing them per image only brings a small overhead + anchors = _create_cell_anchors() + timers['im_detect_bbox'].tic() + k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL + A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS) + inputs = {} + inputs['data'], im_scale, inputs['im_info'] = \ + blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE) + cls_probs, box_preds = [], [] + for lvl in range(k_min, k_max + 1): + suffix = 'fpn{}'.format(lvl) + cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix))) + box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix))) + for k, v in inputs.items(): + workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False)) + + workspace.RunNet(model.net.Proto().name) + cls_probs = workspace.FetchBlobs(cls_probs) + box_preds = workspace.FetchBlobs(box_preds) + + # here the boxes_all are [x0, y0, x1, y1, score] + boxes_all = defaultdict(list) + + cnt = 0 + for lvl in range(k_min, k_max + 1): + # create cell anchors array + stride = 2. 
** lvl + cell_anchors = anchors[lvl] + + # fetch per level probability + cls_prob = cls_probs[cnt] + box_pred = box_preds[cnt] + cls_prob = cls_prob.reshape(( + cls_prob.shape[0], A, int(cls_prob.shape[1] / A), + cls_prob.shape[2], cls_prob.shape[3])) + box_pred = box_pred.reshape(( + box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3])) + cnt += 1 + + if cfg.RETINANET.SOFTMAX: + cls_prob = cls_prob[:, :, 1::, :, :] + + cls_prob_ravel = cls_prob.ravel() + # In some cases [especially for very small img sizes], it's possible that + # candidate_ind is empty if we impose threshold 0.05 at all levels. This + # will lead to errors since no detections are found for this image. Hence, + # for lvl 7 which has small spatial resolution, we take the threshold 0.0 + th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0 + candidate_inds = np.where(cls_prob_ravel > th)[0] + if (len(candidate_inds) == 0): + continue + + pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds)) + inds = np.argpartition( + cls_prob_ravel[candidate_inds], -pre_nms_topn)[-pre_nms_topn:] + inds = candidate_inds[inds] + + inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose() + classes = inds_5d[:, 2] + anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4] + scores = cls_prob[:, anchor_ids, classes, y, x] + + boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32) + boxes *= stride + boxes += cell_anchors[anchor_ids, :] + + if not cfg.RETINANET.CLASS_SPECIFIC_BBOX: + box_deltas = box_pred[0, anchor_ids, :, y, x] + else: + box_cls_inds = classes * 4 + box_deltas = np.vstack( + [box_pred[0, ind:ind + 4, yi, xi] + for ind, yi, xi in zip(box_cls_inds, y, x)] + ) + pred_boxes = ( + box_utils.bbox_transform(boxes, box_deltas) + if cfg.TEST.BBOX_REG else boxes) + pred_boxes /= im_scale + pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape) + box_scores = np.zeros((pred_boxes.shape[0], 5)) + box_scores[:, 0:4] = pred_boxes + box_scores[:, 4] = 
scores + + for cls in range(1, cfg.MODEL.NUM_CLASSES): + inds = np.where(classes == cls - 1)[0] + if len(inds) > 0: + boxes_all[cls].extend(box_scores[inds, :]) + timers['im_detect_bbox'].toc() + + # Combine predictions across all levels and retain the top scoring by class + timers['misc_bbox'].tic() + detections = [] + for cls, boxes in boxes_all.items(): + cls_dets = np.vstack(boxes).astype(dtype=np.float32) + # do class specific nms here + if cfg.TEST.SOFT_NMS.ENABLED: + cls_dets, keep = box_utils.soft_nms( + cls_dets, + sigma=cfg.TEST.SOFT_NMS.SIGMA, + overlap_thresh=cfg.TEST.NMS, + score_thresh=0.0001, + method=cfg.TEST.SOFT_NMS.METHOD + ) + else: + keep = box_utils.nms(cls_dets, cfg.TEST.NMS) + cls_dets = cls_dets[keep, :] + out = np.zeros((len(keep), 6)) + out[:, 0:5] = cls_dets + out[:, 5].fill(cls) + detections.append(out) + + # detections (N, 6) format: + # detections[:, :4] - boxes + # detections[:, 4] - scores + # detections[:, 5] - classes + detections = np.vstack(detections) + # sort all again + inds = np.argsort(-detections[:, 4]) + detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :] + + # Convert the detections to image cls_ format (see core/test_engine.py) + num_classes = cfg.MODEL.NUM_CLASSES + cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)] + for c in range(1, num_classes): + inds = np.where(detections[:, 5] == c)[0] + cls_boxes[c] = detections[inds, :5] + timers['misc_bbox'].toc() + + return cls_boxes diff --git a/detectron/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m b/detectron/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m new file mode 100644 index 0000000000000000000000000000000000000000..629597a1f1d1f978f0065f7b57b4c9eae1957f87 --- /dev/null +++ b/detectron/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m @@ -0,0 +1,14 @@ +function VOCopts = get_voc_opts(path) + +tmp = pwd; +cd(path); +try + addpath('VOCcode'); + VOCinit; +catch + rmpath('VOCcode'); + cd(tmp); + error(sprintf('VOCcode directory not found under %s', 
path)); +end +rmpath('VOCcode'); +cd(tmp); diff --git a/detectron/datasets/VOCdevkit-matlab-wrapper/voc_eval.m b/detectron/datasets/VOCdevkit-matlab-wrapper/voc_eval.m new file mode 100644 index 0000000000000000000000000000000000000000..1911a0e39b91ba8e2a2c1157b8c93e5e65829141 --- /dev/null +++ b/detectron/datasets/VOCdevkit-matlab-wrapper/voc_eval.m @@ -0,0 +1,56 @@ +function res = voc_eval(path, comp_id, test_set, output_dir) + +VOCopts = get_voc_opts(path); +VOCopts.testset = test_set; + +for i = 1:length(VOCopts.classes) + cls = VOCopts.classes{i}; + res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); +end + +fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); +fprintf('Results:\n'); +aps = [res(:).ap]'; +fprintf('%.1f\n', aps * 100); +fprintf('%.1f\n', mean(aps) * 100); +fprintf('~~~~~~~~~~~~~~~~~~~~\n'); + +function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) + +test_set = VOCopts.testset; +year = VOCopts.dataset(4:end); + +addpath(fullfile(VOCopts.datadir, 'VOCcode')); + +res_fn = sprintf(VOCopts.detrespath, comp_id, cls); + +recall = []; +prec = []; +ap = 0; +ap_auc = 0; + +do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); +if do_eval + % Bug in VOCevaldet requires that tic has been called first + tic; + [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); + ap_auc = xVOCap(recall, prec); + + % force plot limits + ylim([0 1]); + xlim([0 1]); + + print(gcf, '-djpeg', '-r0', ... + [output_dir '/' cls '_pr.jpg']); +end +fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); + +res.recall = recall; +res.prec = prec; +res.ap = ap; +res.ap_auc = ap_auc; + +save([output_dir '/' cls '_pr.mat'], ... 
+ 'res', 'recall', 'prec', 'ap', 'ap_auc'); + +rmpath(fullfile(VOCopts.datadir, 'VOCcode')); diff --git a/detectron/datasets/VOCdevkit-matlab-wrapper/xVOCap.m b/detectron/datasets/VOCdevkit-matlab-wrapper/xVOCap.m new file mode 100644 index 0000000000000000000000000000000000000000..7e8024fd1400adcddbcffc988bbc99e2399b7781 --- /dev/null +++ b/detectron/datasets/VOCdevkit-matlab-wrapper/xVOCap.m @@ -0,0 +1,10 @@ +function ap = xVOCap(rec,prec) +% From the PASCAL VOC 2011 devkit + +mrec=[0 ; rec ; 1]; +mpre=[0 ; prec ; 0]; +for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); +end +i=find(mrec(2:end)~=mrec(1:end-1))+1; +ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); diff --git a/detectron/datasets/__init__.py b/detectron/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/detectron/datasets/cityscapes_json_dataset_evaluator.py b/detectron/datasets/cityscapes_json_dataset_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..af7dbccb023b206b7adde1f3459a9418b923fa3c --- /dev/null +++ b/detectron/datasets/cityscapes_json_dataset_evaluator.py @@ -0,0 +1,95 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Functions for evaluating results on Cityscapes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import logging +import os +import uuid + +import pycocotools.mask as mask_util + +from detectron.core.config import cfg +from detectron.datasets.dataset_catalog import get_raw_dir + +logger = logging.getLogger(__name__) + + +def evaluate_masks( + json_dataset, + all_boxes, + all_segms, + output_dir, + use_salt=True, + cleanup=False +): + if cfg.CLUSTER.ON_CLUSTER: + # On the cluster avoid saving these files in the job directory + output_dir = '/tmp' + res_file = os.path.join( + output_dir, 'segmentations_' + json_dataset.name + '_results') + if use_salt: + res_file += '_{}'.format(str(uuid.uuid4())) + res_file += '.json' + + results_dir = os.path.join(output_dir, 'results') + if not os.path.exists(results_dir): + os.mkdir(results_dir) + + os.environ['CITYSCAPES_DATASET'] = get_raw_dir(json_dataset.name) + os.environ['CITYSCAPES_RESULTS'] = output_dir + + # Load the Cityscapes eval script *after* setting the required env vars, + # since the script reads their values into global variables (at load time). 
+ import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling \ + as cityscapes_eval + + roidb = json_dataset.get_roidb() + for i, entry in enumerate(roidb): + im_name = entry['image'] + + basename = os.path.splitext(os.path.basename(im_name))[0] + txtname = os.path.join(output_dir, basename + 'pred.txt') + with open(txtname, 'w') as fid_txt: + if i % 10 == 0: + logger.info('i: {}: {}'.format(i, basename)) + for j in range(1, len(all_segms)): + clss = json_dataset.classes[j] + clss_id = cityscapes_eval.name2label[clss].id + segms = all_segms[j][i] + boxes = all_boxes[j][i] + if segms == []: + continue + masks = mask_util.decode(segms) + + for k in range(boxes.shape[0]): + score = boxes[k, -1] + mask = masks[:, :, k] + pngname = os.path.join( + 'results', + basename + '_' + clss + '_{}.png'.format(k)) + # write txt + fid_txt.write('{} {} {}\n'.format(pngname, clss_id, score)) + # save mask + cv2.imwrite(os.path.join(output_dir, pngname), mask * 255) + logger.info('Evaluating...') + cityscapes_eval.main([]) + return None diff --git a/detectron/datasets/coco_to_cityscapes_id.py b/detectron/datasets/coco_to_cityscapes_id.py new file mode 100644 index 0000000000000000000000000000000000000000..7bf56184f2b07adf3b22b08abfea1c985cd1c75a --- /dev/null +++ b/detectron/datasets/coco_to_cityscapes_id.py @@ -0,0 +1,95 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +# mapping coco categories to cityscapes (our converted json) id +# cityscapes +# INFO roidb.py: 220: 1 bicycle: 7286 +# INFO roidb.py: 220: 2 car: 53684 +# INFO roidb.py: 220: 3 person: 35704 +# INFO roidb.py: 220: 4 train: 336 +# INFO roidb.py: 220: 5 truck: 964 +# INFO roidb.py: 220: 6 motorcycle: 1468 +# INFO roidb.py: 220: 7 bus: 758 +# INFO roidb.py: 220: 8 rider: 3504 + +# coco (val5k) +# INFO roidb.py: 220: 1 person: 21296 +# INFO roidb.py: 220: 2 bicycle: 628 +# INFO roidb.py: 220: 3 car: 3818 +# INFO roidb.py: 220: 4 motorcycle: 732 +# INFO roidb.py: 220: 5 airplane: 286 <------ irrelevant +# INFO roidb.py: 220: 6 bus: 564 +# INFO roidb.py: 220: 7 train: 380 +# INFO roidb.py: 220: 8 truck: 828 + + +def cityscapes_to_coco(cityscapes_id): + lookup = { + 0: 0, # ... background + 1: 2, # bicycle + 2: 3, # car + 3: 1, # person + 4: 7, # train + 5: 8, # truck + 6: 4, # motorcycle + 7: 6, # bus + 8: -1, # rider (-1 means rand init) + } + return lookup[cityscapes_id] + + +def cityscapes_to_coco_with_rider(cityscapes_id): + lookup = { + 0: 0, # ... background + 1: 2, # bicycle + 2: 3, # car + 3: 1, # person + 4: 7, # train + 5: 8, # truck + 6: 4, # motorcycle + 7: 6, # bus + 8: 1, # rider ("person", *rider has human right!*) + } + return lookup[cityscapes_id] + + +def cityscapes_to_coco_without_person_rider(cityscapes_id): + lookup = { + 0: 0, # ... background + 1: 2, # bicycle + 2: 3, # car + 3: -1, # person (ignore) + 4: 7, # train + 5: 8, # truck + 6: 4, # motorcycle + 7: 6, # bus + 8: -1, # rider (ignore) + } + return lookup[cityscapes_id] + + +def cityscapes_to_coco_all_random(cityscapes_id): + lookup = { + 0: -1, # ... 
background + 1: -1, # bicycle + 2: -1, # car + 3: -1, # person (ignore) + 4: -1, # train + 5: -1, # truck + 6: -1, # motorcycle + 7: -1, # bus + 8: -1, # rider (ignore) + } + return lookup[cityscapes_id] diff --git a/detectron/datasets/data/README.md b/detectron/datasets/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..64b201a81457d4d2a9e1d91ef553616f507ea3a3 --- /dev/null +++ b/detectron/datasets/data/README.md @@ -0,0 +1,103 @@ +# Setting Up Datasets + +This directory contains symlinks to data locations. + +## Creating Symlinks for COCO + +Symlink the COCO dataset: + +``` +ln -s /path/to/coco $DETECTRON/detectron/datasets/data/coco +``` + +We assume that your local COCO dataset copy at `/path/to/coco` has the following directory structure: + +``` +coco +|_ coco_train2014 +| |_ .jpg +| |_ ... +| |_ .jpg +|_ coco_val2014 +|_ ... +|_ annotations + |_ instances_train2014.json + |_ ... +``` + +If that is not the case, you may need to do something similar to: + +``` +mkdir -p $DETECTRON/detectron/datasets/data/coco +ln -s /path/to/coco_train2014 $DETECTRON/detectron/datasets/data/coco/coco_train2014 +ln -s /path/to/coco_val2014 $DETECTRON/detectron/datasets/data/coco/coco_val2014 +ln -s /path/to/json/annotations $DETECTRON/detectron/datasets/data/coco/annotations +``` + +### COCO Minival Annotations + +Our custom `minival` and `valminusminival` annotations are available for download [here](https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz). +Please note that `minival` is exactly equivalent to the recently defined 2017 `val` set. +Similarly, the union of `valminusminival` and the 2014 `train` is exactly equivalent to the 2017 `train` set. To complete installation of the COCO dataset, you will need to copy the `minival` and `valminusminival` json annotation files to the `coco/annotations` directory referenced above. 
+ +## Creating Symlinks for PASCAL VOC + +We assume that your symlinked `detectron/datasets/data/VOC` directory has the following structure: + +``` +VOC +|_ JPEGImages +| |_ .jpg +| |_ ... +| |_ .jpg +|_ annotations +| |_ voc__train.json +| |_ voc__val.json +| |_ ... +|_ VOCdevkit +``` + +Create symlinks for `VOC`: + +``` +mkdir -p $DETECTRON/detectron/datasets/data/VOC +ln -s /path/to/VOC/JPEGImages $DETECTRON/detectron/datasets/data/VOC/JPEGImages +ln -s /path/to/VOC/json/annotations $DETECTRON/detectron/datasets/data/VOC/annotations +ln -s /path/to/VOC/devkit $DETECTRON/detectron/datasets/data/VOC/VOCdevkit +``` + +### PASCAL VOC Annotations in COCO Format + +We expect PASCAL VOC annotations converted to COCO json format, which are available for download [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip ). + +## Creating Symlinks for Cityscapes: + +We assume that your symlinked `detectron/datasets/data/cityscapes` directory has the following structure: + +``` +cityscapes +|_ images +| |_ .jpg +| |_ ... +| |_ .jpg +|_ annotations +| |_ instanceonly_gtFile_train.json +| |_ ... +|_ raw + |_ gtFine + |_ ... + |_ README.md +``` + +Create symlinks for `cityscapes`: + +``` +mkdir -p $DETECTRON/detectron/datasets/data/cityscapes +ln -s /path/to/cityscapes/images $DETECTRON/detectron/datasets/data/cityscapes/images +ln -s /path/to/cityscapes/json/annotations $DETECTRON/detectron/datasets/data/cityscapes/annotations +ln -s /path/to/cityscapes/root $DETECTRON/detectron/datasets/data/cityscapes/raw +``` + +### Cityscapes Annotations in COCO Format + +We expect Cityscapes annotations converted to COCO json format, which we will make available for download soon. 
diff --git a/detectron/datasets/dataset_catalog.py b/detectron/datasets/dataset_catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..b92487e6ebf83f7de0d4fdddf163d2c99adc67cc --- /dev/null +++ b/detectron/datasets/dataset_catalog.py @@ -0,0 +1,240 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Collection of available datasets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os + + +# Path to data dir +_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + +# Required dataset entry keys +_IM_DIR = 'image_directory' +_ANN_FN = 'annotation_file' + +# Optional dataset entry keys +_IM_PREFIX = 'image_prefix' +_DEVKIT_DIR = 'devkit_directory' +_RAW_DIR = 'raw_dir' + +# Available datasets +_DATASETS = { + 'cityscapes_fine_instanceonly_seg_train': { + _IM_DIR: + _DATA_DIR + '/cityscapes/images', + _ANN_FN: + _DATA_DIR + '/cityscapes/annotations/instancesonly_gtFine_train.json', + _RAW_DIR: + _DATA_DIR + '/cityscapes/raw' + }, + 'cityscapes_fine_instanceonly_seg_val': { + _IM_DIR: + _DATA_DIR + '/cityscapes/images', + # use filtered validation as there is an issue converting contours + _ANN_FN: + _DATA_DIR + '/cityscapes/annotations/instancesonly_filtered_gtFine_val.json', + _RAW_DIR: + 
_DATA_DIR + '/cityscapes/raw' + }, + 'cityscapes_fine_instanceonly_seg_test': { + _IM_DIR: + _DATA_DIR + '/cityscapes/images', + _ANN_FN: + _DATA_DIR + '/cityscapes/annotations/instancesonly_gtFine_test.json', + _RAW_DIR: + _DATA_DIR + '/cityscapes/raw' + }, + 'coco_2014_train': { + _IM_DIR: + _DATA_DIR + '/coco/coco_train2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/instances_train2014.json' + }, + 'coco_2014_val': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/instances_val2014.json' + }, + 'coco_2014_minival': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/instances_minival2014.json' + }, + 'coco_2014_valminusminival': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/instances_valminusminival2014.json' + }, + 'coco_2015_test': { + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test2015.json' + }, + 'coco_2015_test-dev': { + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test-dev2015.json' + }, + 'coco_2017_test': { # 2017 test uses 2015 test images + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test2017.json', + _IM_PREFIX: + 'COCO_test2015_' + }, + 'coco_2017_test-dev': { # 2017 test-dev uses 2015 test images + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test-dev2017.json', + _IM_PREFIX: + 'COCO_test2015_' + }, + 'coco_stuff_train': { + _IM_DIR: + _DATA_DIR + '/coco/coco_train2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/coco_stuff_train.json' + }, + 'coco_stuff_val': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/coco_stuff_val.json' + }, + 'keypoints_coco_2014_train': { + _IM_DIR: + _DATA_DIR + '/coco/coco_train2014', + _ANN_FN: + _DATA_DIR + 
'/coco/annotations/person_keypoints_train2014.json' + }, + 'keypoints_coco_2014_val': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/person_keypoints_val2014.json' + }, + 'keypoints_coco_2014_minival': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/person_keypoints_minival2014.json' + }, + 'keypoints_coco_2014_valminusminival': { + _IM_DIR: + _DATA_DIR + '/coco/coco_val2014', + _ANN_FN: + _DATA_DIR + '/coco/annotations/person_keypoints_valminusminival2014.json' + }, + 'keypoints_coco_2015_test': { + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test2015.json' + }, + 'keypoints_coco_2015_test-dev': { + _IM_DIR: + _DATA_DIR + '/coco/coco_test2015', + _ANN_FN: + _DATA_DIR + '/coco/annotations/image_info_test-dev2015.json' + }, + 'voc_2007_train': { + _IM_DIR: + _DATA_DIR + '/VOC2007/JPEGImages', + _ANN_FN: + _DATA_DIR + '/VOC2007/annotations/voc_2007_train.json', + _DEVKIT_DIR: + _DATA_DIR + '/VOC2007/VOCdevkit2007' + }, + 'voc_2007_val': { + _IM_DIR: + _DATA_DIR + '/VOC2007/JPEGImages', + _ANN_FN: + _DATA_DIR + '/VOC2007/annotations/voc_2007_val.json', + _DEVKIT_DIR: + _DATA_DIR + '/VOC2007/VOCdevkit2007' + }, + 'voc_2007_test': { + _IM_DIR: + _DATA_DIR + '/VOC2007/JPEGImages', + _ANN_FN: + _DATA_DIR + '/VOC2007/annotations/voc_2007_test.json', + _DEVKIT_DIR: + _DATA_DIR + '/VOC2007/VOCdevkit2007' + }, + 'voc_2012_train': { + _IM_DIR: + _DATA_DIR + '/VOC2012/JPEGImages', + _ANN_FN: + _DATA_DIR + '/VOC2012/annotations/voc_2012_train.json', + _DEVKIT_DIR: + _DATA_DIR + '/VOC2012/VOCdevkit2012' + }, + 'voc_2012_val': { + _IM_DIR: + _DATA_DIR + '/VOC2012/JPEGImages', + _ANN_FN: + _DATA_DIR + '/VOC2012/annotations/voc_2012_val.json', + _DEVKIT_DIR: + _DATA_DIR + '/VOC2012/VOCdevkit2012' + } +} + + +def datasets(): + """Retrieve the list of available dataset names.""" + return _DATASETS.keys() + + +def contains(name): + 
"""Determine if the dataset is in the catalog.""" + return name in _DATASETS.keys() + + +def get_im_dir(name): + """Retrieve the image directory for the dataset.""" + return _DATASETS[name][_IM_DIR] + + +def get_ann_fn(name): + """Retrieve the annotation file for the dataset.""" + return _DATASETS[name][_ANN_FN] + + +def get_im_prefix(name): + """Retrieve the image prefix for the dataset.""" + return _DATASETS[name][_IM_PREFIX] if _IM_PREFIX in _DATASETS[name] else '' + + +def get_devkit_dir(name): + """Retrieve the devkit dir for the dataset.""" + return _DATASETS[name][_DEVKIT_DIR] + + +def get_raw_dir(name): + """Retrieve the raw dir for the dataset.""" + return _DATASETS[name][_RAW_DIR] diff --git a/detectron/datasets/dummy_datasets.py b/detectron/datasets/dummy_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..2f96b8ad3b1060f48dda02a435b221321e537e4a --- /dev/null +++ b/detectron/datasets/dummy_datasets.py @@ -0,0 +1,47 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +"""Provide stub objects that can act as stand-in "dummy" datasets for simple use +cases, like getting all classes in a dataset. This exists so that demos can be +run without requiring users to download/install datasets first. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.utils.collections import AttrDict + + +def get_coco_dataset(): + """A dummy COCO dataset that includes only the 'classes' field.""" + ds = AttrDict() + classes = [ + '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', + 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', + 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', + 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', + 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', + 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', + 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', + 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', + 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush' + ] + ds.classes = {i: name for i, name in enumerate(classes)} + return ds diff --git a/detectron/datasets/json_dataset.py b/detectron/datasets/json_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..71aacc1e5ed1abf0c0f64e5e7517661bdcd8e24d --- /dev/null +++ b/detectron/datasets/json_dataset.py @@ -0,0 +1,465 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Representation of the standard COCO json dataset format. + +When working with a new dataset, we strongly suggest to convert the dataset into +the COCO json format and use the existing code; it is not recommended to write +code to support new dataset formats. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import logging +import numpy as np +import os +import scipy.sparse + +# Must happen before importing COCO API (which imports matplotlib) +import detectron.utils.env as envu +envu.set_up_matplotlib() +# COCO API +from pycocotools import mask as COCOmask +from pycocotools.coco import COCO + +from detectron.core.config import cfg +from detectron.utils.timer import Timer +import detectron.datasets.dataset_catalog as dataset_catalog +import detectron.utils.boxes as box_utils +from detectron.utils.io import load_object +import detectron.utils.segms as segm_utils + +logger = logging.getLogger(__name__) + + +class JsonDataset(object): + """A class representing a COCO json dataset.""" + + def __init__(self, name): + assert dataset_catalog.contains(name), \ + 'Unknown dataset name: {}'.format(name) + assert os.path.exists(dataset_catalog.get_im_dir(name)), \ + 'Im dir \'{}\' not found'.format(dataset_catalog.get_im_dir(name)) + assert os.path.exists(dataset_catalog.get_ann_fn(name)), \ + 'Ann fn \'{}\' not found'.format(dataset_catalog.get_ann_fn(name)) + 
logger.debug('Creating: {}'.format(name)) + self.name = name + self.image_directory = dataset_catalog.get_im_dir(name) + self.image_prefix = dataset_catalog.get_im_prefix(name) + self.COCO = COCO(dataset_catalog.get_ann_fn(name)) + self.debug_timer = Timer() + # Set up dataset classes + category_ids = self.COCO.getCatIds() + categories = [c['name'] for c in self.COCO.loadCats(category_ids)] + self.category_to_id_map = dict(zip(categories, category_ids)) + self.classes = ['__background__'] + categories + self.num_classes = len(self.classes) + self.json_category_id_to_contiguous_id = { + v: i + 1 + for i, v in enumerate(self.COCO.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k + for k, v in self.json_category_id_to_contiguous_id.items() + } + self._init_keypoints() + + def get_roidb( + self, + gt=False, + proposal_file=None, + min_proposal_size=2, + proposal_limit=-1, + crowd_filter_thresh=0 + ): + """Return an roidb corresponding to the json dataset. Optionally: + - include ground truth boxes in the roidb + - add proposals specified in a proposals file + - filter proposals based on a minimum side length + - filter proposals that intersect with crowd regions + """ + assert gt is True or crowd_filter_thresh == 0, \ + 'Crowd filter threshold must be 0 if ground-truth annotations ' \ + 'are not included.' + image_ids = self.COCO.getImgIds() + image_ids.sort() + roidb = copy.deepcopy(self.COCO.loadImgs(image_ids)) + for entry in roidb: + self._prep_roidb_entry(entry) + if gt: + # Include ground-truth object annotations + self.debug_timer.tic() + for entry in roidb: + self._add_gt_annotations(entry) + logger.debug( + '_add_gt_annotations took {:.3f}s'. 
+ format(self.debug_timer.toc(average=False)) + ) + if proposal_file is not None: + # Include proposals from a file + self.debug_timer.tic() + self._add_proposals_from_file( + roidb, proposal_file, min_proposal_size, proposal_limit, + crowd_filter_thresh + ) + logger.debug( + '_add_proposals_from_file took {:.3f}s'. + format(self.debug_timer.toc(average=False)) + ) + _add_class_assignments(roidb) + return roidb + + def _prep_roidb_entry(self, entry): + """Adds empty metadata fields to an roidb entry.""" + # Reference back to the parent dataset + entry['dataset'] = self + # Make file_name an abs path + im_path = os.path.join( + self.image_directory, self.image_prefix + entry['file_name'] + ) + assert os.path.exists(im_path), 'Image \'{}\' not found'.format(im_path) + entry['image'] = im_path + entry['flipped'] = False + entry['has_visible_keypoints'] = False + # Empty placeholders + entry['boxes'] = np.empty((0, 4), dtype=np.float32) + entry['segms'] = [] + entry['gt_classes'] = np.empty((0), dtype=np.int32) + entry['seg_areas'] = np.empty((0), dtype=np.float32) + entry['gt_overlaps'] = scipy.sparse.csr_matrix( + np.empty((0, self.num_classes), dtype=np.float32) + ) + entry['is_crowd'] = np.empty((0), dtype=np.bool) + # 'box_to_gt_ind_map': Shape is (#rois). 
Maps from each roi to the index + # in the list of rois that satisfy np.where(entry['gt_classes'] > 0) + entry['box_to_gt_ind_map'] = np.empty((0), dtype=np.int32) + if self.keypoints is not None: + entry['gt_keypoints'] = np.empty( + (0, 3, self.num_keypoints), dtype=np.int32 + ) + # Remove unwanted fields that come from the json file (if they exist) + for k in ['date_captured', 'url', 'license', 'file_name']: + if k in entry: + del entry[k] + + def _add_gt_annotations(self, entry): + """Add ground truth annotation metadata to an roidb entry.""" + ann_ids = self.COCO.getAnnIds(imgIds=entry['id'], iscrowd=None) + objs = self.COCO.loadAnns(ann_ids) + # Sanitize bboxes -- some are invalid + valid_objs = [] + valid_segms = [] + width = entry['width'] + height = entry['height'] + for obj in objs: + # crowd regions are RLE encoded + if segm_utils.is_poly(obj['segmentation']): + # Valid polygons have >= 3 points, so require >= 6 coordinates + obj['segmentation'] = [ + p for p in obj['segmentation'] if len(p) >= 6 + ] + if obj['area'] < cfg.TRAIN.GT_MIN_AREA: + continue + if 'ignore' in obj and obj['ignore'] == 1: + continue + # Convert form (x1, y1, w, h) to (x1, y1, x2, y2) + x1, y1, x2, y2 = box_utils.xywh_to_xyxy(obj['bbox']) + x1, y1, x2, y2 = box_utils.clip_xyxy_to_image( + x1, y1, x2, y2, height, width + ) + # Require non-zero seg area and more than 1x1 box size + if obj['area'] > 0 and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2, y2] + valid_objs.append(obj) + valid_segms.append(obj['segmentation']) + num_valid_objs = len(valid_objs) + + boxes = np.zeros((num_valid_objs, 4), dtype=entry['boxes'].dtype) + gt_classes = np.zeros((num_valid_objs), dtype=entry['gt_classes'].dtype) + gt_overlaps = np.zeros( + (num_valid_objs, self.num_classes), + dtype=entry['gt_overlaps'].dtype + ) + seg_areas = np.zeros((num_valid_objs), dtype=entry['seg_areas'].dtype) + is_crowd = np.zeros((num_valid_objs), dtype=entry['is_crowd'].dtype) + box_to_gt_ind_map = np.zeros( + 
(num_valid_objs), dtype=entry['box_to_gt_ind_map'].dtype + ) + if self.keypoints is not None: + gt_keypoints = np.zeros( + (num_valid_objs, 3, self.num_keypoints), + dtype=entry['gt_keypoints'].dtype + ) + + im_has_visible_keypoints = False + for ix, obj in enumerate(valid_objs): + cls = self.json_category_id_to_contiguous_id[obj['category_id']] + boxes[ix, :] = obj['clean_bbox'] + gt_classes[ix] = cls + seg_areas[ix] = obj['area'] + is_crowd[ix] = obj['iscrowd'] + box_to_gt_ind_map[ix] = ix + if self.keypoints is not None: + gt_keypoints[ix, :, :] = self._get_gt_keypoints(obj) + if np.sum(gt_keypoints[ix, 2, :]) > 0: + im_has_visible_keypoints = True + if obj['iscrowd']: + # Set overlap to -1 for all classes for crowd objects + # so they will be excluded during training + gt_overlaps[ix, :] = -1.0 + else: + gt_overlaps[ix, cls] = 1.0 + entry['boxes'] = np.append(entry['boxes'], boxes, axis=0) + entry['segms'].extend(valid_segms) + # To match the original implementation: + # entry['boxes'] = np.append( + # entry['boxes'], boxes.astype(np.int).astype(np.float), axis=0) + entry['gt_classes'] = np.append(entry['gt_classes'], gt_classes) + entry['seg_areas'] = np.append(entry['seg_areas'], seg_areas) + entry['gt_overlaps'] = np.append( + entry['gt_overlaps'].toarray(), gt_overlaps, axis=0 + ) + entry['gt_overlaps'] = scipy.sparse.csr_matrix(entry['gt_overlaps']) + entry['is_crowd'] = np.append(entry['is_crowd'], is_crowd) + entry['box_to_gt_ind_map'] = np.append( + entry['box_to_gt_ind_map'], box_to_gt_ind_map + ) + if self.keypoints is not None: + entry['gt_keypoints'] = np.append( + entry['gt_keypoints'], gt_keypoints, axis=0 + ) + entry['has_visible_keypoints'] = im_has_visible_keypoints + + def _add_proposals_from_file( + self, roidb, proposal_file, min_proposal_size, top_k, crowd_thresh + ): + """Add proposals from a proposals file to an roidb.""" + logger.info('Loading proposals from: {}'.format(proposal_file)) + proposals = load_object(proposal_file) + + 
id_field = 'indexes' if 'indexes' in proposals else 'ids' # compat fix + + _remove_proposals_not_in_roidb(proposals, roidb, id_field) + _sort_proposals(proposals, id_field) + box_list = [] + for i, entry in enumerate(roidb): + if i % 2500 == 0: + logger.info(' {:d}/{:d}'.format(i + 1, len(roidb))) + boxes = proposals['boxes'][i] + # Sanity check that these boxes are for the correct image id + assert entry['id'] == proposals[id_field][i] + # Remove duplicate boxes and very small boxes and then take top k + boxes = box_utils.clip_boxes_to_image( + boxes, entry['height'], entry['width'] + ) + keep = box_utils.unique_boxes(boxes) + boxes = boxes[keep, :] + keep = box_utils.filter_small_boxes(boxes, min_proposal_size) + boxes = boxes[keep, :] + if top_k > 0: + boxes = boxes[:top_k, :] + box_list.append(boxes) + _merge_proposal_boxes_into_roidb(roidb, box_list) + if crowd_thresh > 0: + _filter_crowd_proposals(roidb, crowd_thresh) + + def _init_keypoints(self): + """Initialize COCO keypoint information.""" + self.keypoints = None + self.keypoint_flip_map = None + self.keypoints_to_id_map = None + self.num_keypoints = 0 + # Thus far only the 'person' category has keypoints + if 'person' in self.category_to_id_map: + cat_info = self.COCO.loadCats([self.category_to_id_map['person']]) + else: + return + + # Check if the annotations contain keypoint data or not + if 'keypoints' in cat_info[0]: + keypoints = cat_info[0]['keypoints'] + self.keypoints_to_id_map = dict( + zip(keypoints, range(len(keypoints)))) + self.keypoints = keypoints + self.num_keypoints = len(keypoints) + self.keypoint_flip_map = { + 'left_eye': 'right_eye', + 'left_ear': 'right_ear', + 'left_shoulder': 'right_shoulder', + 'left_elbow': 'right_elbow', + 'left_wrist': 'right_wrist', + 'left_hip': 'right_hip', + 'left_knee': 'right_knee', + 'left_ankle': 'right_ankle'} + + def _get_gt_keypoints(self, obj): + """Return ground truth keypoints.""" + if 'keypoints' not in obj: + return None + kp = 
np.array(obj['keypoints']) + x = kp[0::3] # 0-indexed x coordinates + y = kp[1::3] # 0-indexed y coordinates + # 0: not labeled; 1: labeled, not inside mask; + # 2: labeled and inside mask + v = kp[2::3] + num_keypoints = len(obj['keypoints']) / 3 + assert num_keypoints == self.num_keypoints + gt_kps = np.ones((3, self.num_keypoints), dtype=np.int32) + for i in range(self.num_keypoints): + gt_kps[0, i] = x[i] + gt_kps[1, i] = y[i] + gt_kps[2, i] = v[i] + return gt_kps + + +def add_proposals(roidb, rois, scales, crowd_thresh): + """Add proposal boxes (rois) to an roidb that has ground-truth annotations + but no proposals. If the proposals are not at the original image scale, + specify the scale factor that separate them in scales. + """ + box_list = [] + for i in range(len(roidb)): + inv_im_scale = 1. / scales[i] + idx = np.where(rois[:, 0] == i)[0] + box_list.append(rois[idx, 1:] * inv_im_scale) + _merge_proposal_boxes_into_roidb(roidb, box_list) + if crowd_thresh > 0: + _filter_crowd_proposals(roidb, crowd_thresh) + _add_class_assignments(roidb) + + +def _merge_proposal_boxes_into_roidb(roidb, box_list): + """Add proposal boxes to each roidb entry.""" + assert len(box_list) == len(roidb) + for i, entry in enumerate(roidb): + boxes = box_list[i] + num_boxes = boxes.shape[0] + gt_overlaps = np.zeros( + (num_boxes, entry['gt_overlaps'].shape[1]), + dtype=entry['gt_overlaps'].dtype + ) + box_to_gt_ind_map = -np.ones( + (num_boxes), dtype=entry['box_to_gt_ind_map'].dtype + ) + + # Note: unlike in other places, here we intentionally include all gt + # rois, even ones marked as crowd. Boxes that overlap with crowds will + # be filtered out later (see: _filter_crowd_proposals). 
+ gt_inds = np.where(entry['gt_classes'] > 0)[0] + if len(gt_inds) > 0: + gt_boxes = entry['boxes'][gt_inds, :] + gt_classes = entry['gt_classes'][gt_inds] + proposal_to_gt_overlaps = box_utils.bbox_overlaps( + boxes.astype(dtype=np.float32, copy=False), + gt_boxes.astype(dtype=np.float32, copy=False) + ) + # Gt box that overlaps each input box the most + # (ties are broken arbitrarily by class order) + argmaxes = proposal_to_gt_overlaps.argmax(axis=1) + # Amount of that overlap + maxes = proposal_to_gt_overlaps.max(axis=1) + # Those boxes with non-zero overlap with gt boxes + I = np.where(maxes > 0)[0] + # Record max overlaps with the class of the appropriate gt box + gt_overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + box_to_gt_ind_map[I] = gt_inds[argmaxes[I]] + entry['boxes'] = np.append( + entry['boxes'], + boxes.astype(entry['boxes'].dtype, copy=False), + axis=0 + ) + entry['gt_classes'] = np.append( + entry['gt_classes'], + np.zeros((num_boxes), dtype=entry['gt_classes'].dtype) + ) + entry['seg_areas'] = np.append( + entry['seg_areas'], + np.zeros((num_boxes), dtype=entry['seg_areas'].dtype) + ) + entry['gt_overlaps'] = np.append( + entry['gt_overlaps'].toarray(), gt_overlaps, axis=0 + ) + entry['gt_overlaps'] = scipy.sparse.csr_matrix(entry['gt_overlaps']) + entry['is_crowd'] = np.append( + entry['is_crowd'], + np.zeros((num_boxes), dtype=entry['is_crowd'].dtype) + ) + entry['box_to_gt_ind_map'] = np.append( + entry['box_to_gt_ind_map'], + box_to_gt_ind_map.astype( + entry['box_to_gt_ind_map'].dtype, copy=False + ) + ) + + +def _filter_crowd_proposals(roidb, crowd_thresh): + """Finds proposals that are inside crowd regions and marks them as + overlap = -1 with each ground-truth rois, which means they will be excluded + from training. 
+ """ + for entry in roidb: + gt_overlaps = entry['gt_overlaps'].toarray() + crowd_inds = np.where(entry['is_crowd'] == 1)[0] + non_gt_inds = np.where(entry['gt_classes'] == 0)[0] + if len(crowd_inds) == 0 or len(non_gt_inds) == 0: + continue + crowd_boxes = box_utils.xyxy_to_xywh(entry['boxes'][crowd_inds, :]) + non_gt_boxes = box_utils.xyxy_to_xywh(entry['boxes'][non_gt_inds, :]) + iscrowd_flags = [int(True)] * len(crowd_inds) + ious = COCOmask.iou(non_gt_boxes, crowd_boxes, iscrowd_flags) + bad_inds = np.where(ious.max(axis=1) > crowd_thresh)[0] + gt_overlaps[non_gt_inds[bad_inds], :] = -1 + entry['gt_overlaps'] = scipy.sparse.csr_matrix(gt_overlaps) + + +def _add_class_assignments(roidb): + """Compute object category assignment for each box associated with each + roidb entry. + """ + for entry in roidb: + gt_overlaps = entry['gt_overlaps'].toarray() + # max overlap with gt over classes (columns) + max_overlaps = gt_overlaps.max(axis=1) + # gt class that had the max overlap + max_classes = gt_overlaps.argmax(axis=1) + entry['max_classes'] = max_classes + entry['max_overlaps'] = max_overlaps + # sanity checks + # if max overlap is 0, the class must be background (class 0) + zero_inds = np.where(max_overlaps == 0)[0] + assert all(max_classes[zero_inds] == 0) + # if max overlap > 0, the class must be a fg class (not class 0) + nonzero_inds = np.where(max_overlaps > 0)[0] + assert all(max_classes[nonzero_inds] != 0) + + +def _sort_proposals(proposals, id_field): + """Sort proposals by the specified id field.""" + order = np.argsort(proposals[id_field]) + fields_to_sort = ['boxes', id_field, 'scores'] + for k in fields_to_sort: + proposals[k] = [proposals[k][i] for i in order] + + +def _remove_proposals_not_in_roidb(proposals, roidb, id_field): + # fix proposals so they don't contain entries for images not in the roidb + roidb_ids = set({entry["id"] for entry in roidb}) + keep = [i for i, id in enumerate(proposals[id_field]) if id in roidb_ids] + for f in ['boxes', 
id_field, 'scores']: + proposals[f] = [proposals[f][i] for i in keep] diff --git a/detectron/datasets/json_dataset_evaluator.py b/detectron/datasets/json_dataset_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..5b47f0bd9da98463b13b6db635c5c14b7d630d89 --- /dev/null +++ b/detectron/datasets/json_dataset_evaluator.py @@ -0,0 +1,471 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Functions for evaluating results computed for a json dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import json +import logging +import numpy as np +import os +import six +import uuid + +from pycocotools.cocoeval import COCOeval + +from detectron.core.config import cfg +from detectron.utils.io import save_object +import detectron.utils.boxes as box_utils + +logger = logging.getLogger(__name__) + + +def evaluate_masks( + json_dataset, + all_boxes, + all_segms, + output_dir, + use_salt=True, + cleanup=False +): + res_file = os.path.join( + output_dir, 'segmentations_' + json_dataset.name + '_results' + ) + if use_salt: + res_file += '_{}'.format(str(uuid.uuid4())) + res_file += '.json' + _write_coco_segms_results_file( + json_dataset, all_boxes, all_segms, res_file) + # Only do evaluation on non-test sets 
(annotations are undisclosed on test) + if json_dataset.name.find('test') == -1: + coco_eval = _do_segmentation_eval(json_dataset, res_file, output_dir) + else: + logger.warning( + '{} eval ignored as annotations are undisclosed on test: {} ignored' + .format("Segmentation", json_dataset.name) + ) + coco_eval = None + # Optionally cleanup results json file + if cleanup: + os.remove(res_file) + return coco_eval + + +def _write_coco_segms_results_file( + json_dataset, all_boxes, all_segms, res_file +): + # [{"image_id": 42, + # "category_id": 18, + # "segmentation": [...], + # "score": 0.236}, ...] + results = [] + for cls_ind, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + if cls_ind >= len(all_boxes): + break + cat_id = json_dataset.category_to_id_map[cls] + results.extend(_coco_segms_results_one_category( + json_dataset, all_boxes[cls_ind], all_segms[cls_ind], cat_id)) + logger.info( + 'Writing segmentation results json to: {}'.format( + os.path.abspath(res_file))) + with open(res_file, 'w') as fid: + # "counts" is an array encoded by mask_util as a byte-stream. Python3's + # json writer which /always produces strings/ cannot serialize a bytestream + # unless you decode it. Thankfully, utf-8 works out (which is also what + # the pycocotools/_mask.pyx does. 
+ if six.PY3: + for r in results: + rle = r['segmentation'] + if 'counts' in rle: + rle['counts'] = rle['counts'].decode("utf8") + + json.dump(results, fid) + + +def _coco_segms_results_one_category(json_dataset, boxes, segms, cat_id): + results = [] + image_ids = json_dataset.COCO.getImgIds() + image_ids.sort() + assert len(boxes) == len(image_ids) + assert len(segms) == len(image_ids) + for i, image_id in enumerate(image_ids): + dets = boxes[i] + rles = segms[i] + + if isinstance(dets, list) and len(dets) == 0: + continue + + dets = dets.astype(np.float) + scores = dets[:, -1] + + results.extend( + [{'image_id': image_id, + 'category_id': cat_id, + 'segmentation': rles[k], + 'score': scores[k]} + for k in range(dets.shape[0])]) + + return results + + +def _do_segmentation_eval(json_dataset, res_file, output_dir): + coco_dt = json_dataset.COCO.loadRes(str(res_file)) + coco_eval = COCOeval(json_dataset.COCO, coco_dt, 'segm') + coco_eval.evaluate() + coco_eval.accumulate() + _log_detection_eval_metrics(json_dataset, coco_eval) + eval_file = os.path.join(output_dir, 'segmentation_results.pkl') + save_object(coco_eval, eval_file) + logger.info('Wrote json eval results to: {}'.format(eval_file)) + return coco_eval + + +def evaluate_boxes( + json_dataset, all_boxes, output_dir, use_salt=True, cleanup=False +): + res_file = os.path.join( + output_dir, 'bbox_' + json_dataset.name + '_results' + ) + if use_salt: + res_file += '_{}'.format(str(uuid.uuid4())) + res_file += '.json' + _write_coco_bbox_results_file(json_dataset, all_boxes, res_file) + # Only do evaluation on non-test sets (annotations are undisclosed on test) + if json_dataset.name.find('test') == -1: + coco_eval = _do_detection_eval(json_dataset, res_file, output_dir) + else: + logger.warning( + '{} eval ignored as annotations are undisclosed on test: {} ignored' + .format("Bbox", json_dataset.name) + ) + coco_eval = None + # Optionally cleanup results json file + if cleanup: + os.remove(res_file) + return 
coco_eval + + +def _write_coco_bbox_results_file(json_dataset, all_boxes, res_file): + # [{"image_id": 42, + # "category_id": 18, + # "bbox": [258.15,41.29,348.26,243.78], + # "score": 0.236}, ...] + results = [] + for cls_ind, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + if cls_ind >= len(all_boxes): + break + cat_id = json_dataset.category_to_id_map[cls] + results.extend(_coco_bbox_results_one_category( + json_dataset, all_boxes[cls_ind], cat_id)) + logger.info( + 'Writing bbox results json to: {}'.format(os.path.abspath(res_file))) + with open(res_file, 'w') as fid: + json.dump(results, fid) + + +def _coco_bbox_results_one_category(json_dataset, boxes, cat_id): + results = [] + image_ids = json_dataset.COCO.getImgIds() + image_ids.sort() + assert len(boxes) == len(image_ids) + for i, image_id in enumerate(image_ids): + dets = boxes[i] + if isinstance(dets, list) and len(dets) == 0: + continue + dets = dets.astype(np.float) + scores = dets[:, -1] + xywh_dets = box_utils.xyxy_to_xywh(dets[:, 0:4]) + xs = xywh_dets[:, 0] + ys = xywh_dets[:, 1] + ws = xywh_dets[:, 2] + hs = xywh_dets[:, 3] + results.extend( + [{'image_id': image_id, + 'category_id': cat_id, + 'bbox': [xs[k], ys[k], ws[k], hs[k]], + 'score': scores[k]} for k in range(dets.shape[0])]) + return results + + +def _do_detection_eval(json_dataset, res_file, output_dir): + coco_dt = json_dataset.COCO.loadRes(str(res_file)) + coco_eval = COCOeval(json_dataset.COCO, coco_dt, 'bbox') + coco_eval.evaluate() + coco_eval.accumulate() + _log_detection_eval_metrics(json_dataset, coco_eval) + eval_file = os.path.join(output_dir, 'detection_results.pkl') + save_object(coco_eval, eval_file) + logger.info('Wrote json eval results to: {}'.format(eval_file)) + return coco_eval + + +def _log_detection_eval_metrics(json_dataset, coco_eval): + def _get_thr_ind(coco_eval, thr): + ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & + (coco_eval.params.iouThrs < thr + 1e-5))[0][0] + 
iou_thr = coco_eval.params.iouThrs[ind] + assert np.isclose(iou_thr, thr) + return ind + + IoU_lo_thresh = 0.5 + IoU_hi_thresh = 0.95 + ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) + ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) + # precision has dims (iou, recall, cls, area range, max dets) + # area range index 0: all area ranges + # max dets index 2: 100 per image + precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] + ap_default = np.mean(precision[precision > -1]) + logger.info( + '~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ~~~~'.format( + IoU_lo_thresh, IoU_hi_thresh)) + logger.info('{:.1f}'.format(100 * ap_default)) + for cls_ind, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + # minus 1 because of __background__ + precision = coco_eval.eval['precision'][ + ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] + ap = np.mean(precision[precision > -1]) + logger.info('{:.1f}'.format(100 * ap)) + logger.info('~~~~ Summary metrics ~~~~') + coco_eval.summarize() + + +def evaluate_box_proposals( + json_dataset, roidb, thresholds=None, area='all', limit=None, class_specific=False +): + """Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. 
+ """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + 'all': 0, + 'small': 1, + 'medium': 2, + 'large': 3, + '96-128': 4, + '128-256': 5, + '256-512': 6, + '512-inf': 7} + area_ranges = [ + [0**2, 1e5**2], # all + [0**2, 32**2], # small + [32**2, 96**2], # medium + [96**2, 1e5**2], # large + [96**2, 128**2], # 96-128 + [128**2, 256**2], # 128-256 + [256**2, 512**2], # 256-512 + [512**2, 1e5**2]] # 512-inf + assert area in areas, 'Unknown area range: {}'.format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = np.zeros(0) + gt_classes = np.zeros(0) + num_pos = 0 + for entry in roidb: + gt_inds = np.where( + (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] + gt_boxes = entry['boxes'][gt_inds, :] + gt_areas = entry['seg_areas'][gt_inds] + valid_gt_inds = np.where( + (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]))[0] + gt_boxes = gt_boxes[valid_gt_inds, :] + _gt_classes = entry["gt_classes"][valid_gt_inds] + assert gt_boxes.shape[0] == _gt_classes.shape[0] + gt_classes = np.hstack((gt_classes, _gt_classes)) + num_pos += len(valid_gt_inds) + non_gt_inds = np.where(entry['gt_classes'] == 0)[0] + boxes = entry['boxes'][non_gt_inds, :] + if boxes.shape[0] == 0: + continue + if limit is not None and boxes.shape[0] > limit: + boxes = boxes[:limit, :] + overlaps = box_utils.bbox_overlaps( + boxes.astype(dtype=np.float32, copy=False), + gt_boxes.astype(dtype=np.float32, copy=False)) + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + for j in range(min(boxes.shape[0], gt_boxes.shape[0])): + # find which proposal box maximally covers each gt box + argmax_overlaps = overlaps.argmax(axis=0) + # and get the iou amount of coverage for each gt box + max_overlaps = overlaps.max(axis=0) + # find which gt box is 'best' covered (i.e. 
'best' = most iou) + gt_ind = max_overlaps.argmax() + gt_ovr = max_overlaps.max() + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + # append recorded iou coverage level + gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + + if thresholds is None: + step = 0.05 + thresholds = np.arange(0.5, 0.95 + 1e-5, step) + + if not class_specific: + gt_overlaps = np.sort(gt_overlaps) + recalls = np.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + ar = recalls.mean() + return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, + 'gt_overlaps': gt_overlaps, 'num_pos': num_pos} + else: + gt_classes_unique = np.unique(gt_classes) + recalls = np.zeros((gt_classes_unique.shape[0], thresholds.shape[0])) + # compute recall for each category and each iou threshold + for i, category_id in enumerate(gt_classes_unique): + inds = (gt_classes == category_id) + num_pos_per_category = float(inds.sum()) + for j, thresh in enumerate(thresholds): + recalls[i][j] = ( + gt_overlaps[inds] >= thresh + ).sum() / num_pos_per_category + ar = recalls.mean(axis=1).mean() + return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, + 'gt_overlaps': gt_overlaps, 'num_pos': num_pos} + +def evaluate_keypoints( + json_dataset, + all_boxes, + all_keypoints, + output_dir, + use_salt=True, + cleanup=False +): + res_file = os.path.join( + output_dir, 'keypoints_' + json_dataset.name + '_results' + ) + if use_salt: + res_file += '_{}'.format(str(uuid.uuid4())) + res_file += '.json' + _write_coco_keypoint_results_file( + json_dataset, all_boxes, all_keypoints, res_file) + # Only do evaluation on 
non-test sets (annotations are undisclosed on test) + if json_dataset.name.find('test') == -1: + coco_eval = _do_keypoint_eval(json_dataset, res_file, output_dir) + else: + logger.warning( + '{} eval ignored as annotations are undisclosed on test: {} ignored' + .format("Keypoints", json_dataset.name) + ) + coco_eval = None + # Optionally cleanup results json file + if cleanup: + os.remove(res_file) + return coco_eval + + +def _write_coco_keypoint_results_file( + json_dataset, all_boxes, all_keypoints, res_file +): + results = [] + for cls_ind, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + if cls_ind >= len(all_keypoints): + break + logger.info( + 'Collecting {} results ({:d}/{:d})'.format( + cls, cls_ind, len(all_keypoints) - 1)) + cat_id = json_dataset.category_to_id_map[cls] + results.extend(_coco_kp_results_one_category( + json_dataset, all_boxes[cls_ind], all_keypoints[cls_ind], cat_id)) + logger.info( + 'Writing keypoint results json to: {}'.format( + os.path.abspath(res_file))) + with open(res_file, 'w') as fid: + json.dump(results, fid) + + +def _coco_kp_results_one_category(json_dataset, boxes, kps, cat_id): + results = [] + image_ids = json_dataset.COCO.getImgIds() + image_ids.sort() + assert len(kps) == len(image_ids) + assert len(boxes) == len(image_ids) + use_box_score = False + if cfg.KRCNN.KEYPOINT_CONFIDENCE == 'logit': + # This is ugly; see utils.keypoints.heatmap_to_keypoints for the magic + # indexes + score_index = 2 + elif cfg.KRCNN.KEYPOINT_CONFIDENCE == 'prob': + score_index = 3 + elif cfg.KRCNN.KEYPOINT_CONFIDENCE == 'bbox': + use_box_score = True + else: + raise ValueError( + 'KRCNN.KEYPOINT_CONFIDENCE must be "logit", "prob", or "bbox"') + for i, image_id in enumerate(image_ids): + if len(boxes[i]) == 0: + continue + kps_dets = kps[i] + scores = boxes[i][:, -1].astype(np.float) + if len(kps_dets) == 0: + continue + for j in range(len(kps_dets)): + xy = [] + + kps_score = 0 + for k in 
range(kps_dets[j].shape[1]): + xy.append(float(kps_dets[j][0, k])) + xy.append(float(kps_dets[j][1, k])) + xy.append(1) + if not use_box_score: + kps_score += kps_dets[j][score_index, k] + + if use_box_score: + kps_score = scores[j] + else: + kps_score /= kps_dets[j].shape[1] + + results.extend([{'image_id': image_id, + 'category_id': cat_id, + 'keypoints': xy, + 'score': kps_score}]) + return results + + +def _do_keypoint_eval(json_dataset, res_file, output_dir): + ann_type = 'keypoints' + imgIds = json_dataset.COCO.getImgIds() + imgIds.sort() + coco_dt = json_dataset.COCO.loadRes(res_file) + coco_eval = COCOeval(json_dataset.COCO, coco_dt, ann_type) + coco_eval.params.imgIds = imgIds + coco_eval.evaluate() + coco_eval.accumulate() + eval_file = os.path.join(output_dir, 'keypoint_results.pkl') + save_object(coco_eval, eval_file) + logger.info('Wrote json eval results to: {}'.format(eval_file)) + coco_eval.summarize() + return coco_eval diff --git a/detectron/datasets/roidb.py b/detectron/datasets/roidb.py new file mode 100644 index 0000000000000000000000000000000000000000..57b6e9cfeedc12d822e0b9ec46e58ff9a4bbb288 --- /dev/null +++ b/detectron/datasets/roidb.py @@ -0,0 +1,199 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Functions for common roidb manipulations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from past.builtins import basestring +import logging +import numpy as np + +from detectron.core.config import cfg +from detectron.datasets.json_dataset import JsonDataset +import detectron.utils.boxes as box_utils +import detectron.utils.keypoints as keypoint_utils +import detectron.utils.segms as segm_utils + +logger = logging.getLogger(__name__) + + +def combined_roidb_for_training(dataset_names, proposal_files): + """Load and concatenate roidbs for one or more datasets, along with optional + object proposals. The roidb entries are then prepared for use in training, + which involves caching certain types of metadata for each roidb entry. + """ + def get_roidb(dataset_name, proposal_file): + ds = JsonDataset(dataset_name) + roidb = ds.get_roidb( + gt=True, + proposal_file=proposal_file, + crowd_filter_thresh=cfg.TRAIN.CROWD_FILTER_THRESH + ) + if cfg.TRAIN.USE_FLIPPED: + logger.info('Appending horizontally-flipped training examples...') + extend_with_flipped_entries(roidb, ds) + logger.info('Loaded dataset: {:s}'.format(ds.name)) + return roidb + + if isinstance(dataset_names, basestring): + dataset_names = (dataset_names, ) + if isinstance(proposal_files, basestring): + proposal_files = (proposal_files, ) + if len(proposal_files) == 0: + proposal_files = (None, ) * len(dataset_names) + assert len(dataset_names) == len(proposal_files) + roidbs = [get_roidb(*args) for args in zip(dataset_names, proposal_files)] + roidb = roidbs[0] + for r in roidbs[1:]: + roidb.extend(r) + roidb = filter_for_training(roidb) + + logger.info('Computing bounding-box regression targets...') + add_bbox_regression_targets(roidb) + logger.info('done') + + _compute_and_log_stats(roidb) + + return roidb + + 
+def extend_with_flipped_entries(roidb, dataset): + """Flip each entry in the given roidb and return a new roidb that is the + concatenation of the original roidb and the flipped entries. + + "Flipping" an entry means that that image and associated metadata (e.g., + ground truth boxes and object proposals) are horizontally flipped. + """ + flipped_roidb = [] + for entry in roidb: + width = entry['width'] + boxes = entry['boxes'].copy() + oldx1 = boxes[:, 0].copy() + oldx2 = boxes[:, 2].copy() + boxes[:, 0] = width - oldx2 - 1 + boxes[:, 2] = width - oldx1 - 1 + assert (boxes[:, 2] >= boxes[:, 0]).all() + flipped_entry = {} + dont_copy = ('boxes', 'segms', 'gt_keypoints', 'flipped') + for k, v in entry.items(): + if k not in dont_copy: + flipped_entry[k] = v + flipped_entry['boxes'] = boxes + flipped_entry['segms'] = segm_utils.flip_segms( + entry['segms'], entry['height'], entry['width'] + ) + if dataset.keypoints is not None: + flipped_entry['gt_keypoints'] = keypoint_utils.flip_keypoints( + dataset.keypoints, dataset.keypoint_flip_map, + entry['gt_keypoints'], entry['width'] + ) + flipped_entry['flipped'] = True + flipped_roidb.append(flipped_entry) + roidb.extend(flipped_roidb) + + +def filter_for_training(roidb): + """Remove roidb entries that have no usable RoIs based on config settings. 
+ """ + def is_valid(entry): + # Valid images have: + # (1) At least one foreground RoI OR + # (2) At least one background RoI + overlaps = entry['max_overlaps'] + # find boxes with sufficient overlap + fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & + (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] + # image is only valid if such boxes exist + valid = len(fg_inds) > 0 or len(bg_inds) > 0 + if cfg.MODEL.KEYPOINTS_ON: + # If we're training for keypoints, exclude images with no keypoints + valid = valid and entry['has_visible_keypoints'] + return valid + + num = len(roidb) + filtered_roidb = [entry for entry in roidb if is_valid(entry)] + num_after = len(filtered_roidb) + logger.info('Filtered {} roidb entries: {} -> {}'. + format(num - num_after, num, num_after)) + return filtered_roidb + + +def add_bbox_regression_targets(roidb): + """Add information needed to train bounding-box regressors.""" + for entry in roidb: + entry['bbox_targets'] = compute_bbox_regression_targets(entry) + + +def compute_bbox_regression_targets(entry): + """Compute bounding-box regression targets for an image.""" + # Indices of ground-truth ROIs + rois = entry['boxes'] + overlaps = entry['max_overlaps'] + labels = entry['max_classes'] + gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] + # Targets has format (class, tx, ty, tw, th) + targets = np.zeros((rois.shape[0], 5), dtype=np.float32) + if len(gt_inds) == 0: + # Bail if the image has no ground-truth ROIs + return targets + + # Indices of examples for which we try to make predictions + ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] + + # Get IoU overlap between each ex ROI and gt ROI + ex_gt_overlaps = box_utils.bbox_overlaps( + rois[ex_inds, :].astype(dtype=np.float32, copy=False), + rois[gt_inds, :].astype(dtype=np.float32, copy=False)) + + # Find which gt ROI each ex 
ROI has max overlap with: + # this will be the ex ROI's gt target + gt_assignment = ex_gt_overlaps.argmax(axis=1) + gt_rois = rois[gt_inds[gt_assignment], :] + ex_rois = rois[ex_inds, :] + # Use class "1" for all boxes if using class_agnostic_bbox_reg + targets[ex_inds, 0] = ( + 1 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else labels[ex_inds]) + targets[ex_inds, 1:] = box_utils.bbox_transform_inv( + ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS) + return targets + + +def _compute_and_log_stats(roidb): + classes = roidb[0]['dataset'].classes + char_len = np.max([len(c) for c in classes]) + hist_bins = np.arange(len(classes) + 1) + + # Histogram of ground-truth objects + gt_hist = np.zeros((len(classes)), dtype=np.int) + for entry in roidb: + gt_inds = np.where( + (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] + gt_classes = entry['gt_classes'][gt_inds] + gt_hist += np.histogram(gt_classes, bins=hist_bins)[0] + logger.debug('Ground-truth class histogram:') + for i, v in enumerate(gt_hist): + logger.debug( + '{:d}{:s}: {:d}'.format( + i, classes[i].rjust(char_len), v)) + logger.debug('-' * char_len) + logger.debug( + '{:s}: {:d}'.format( + 'total'.rjust(char_len), np.sum(gt_hist))) diff --git a/detectron/datasets/task_evaluation.py b/detectron/datasets/task_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..9caefe559b8de2cbb2aefdd82d3f567695633056 --- /dev/null +++ b/detectron/datasets/task_evaluation.py @@ -0,0 +1,411 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Evaluation interface for supported tasks (box detection, instance +segmentation, keypoint detection, ...). + + +Results are stored in an OrderedDict with the following nested structure: + +: + : + : + + is any valid dataset (e.g., 'coco_2014_minival') + is in ['box', 'mask', 'keypoint', 'box_proposal'] + can be ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'AR@1000', + 'ARs@1000', 'ARm@1000', 'ARl@1000', ...] + is a floating point number +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import OrderedDict +import logging +import os +import pprint + +from detectron.core.config import cfg +from detectron.utils.logging import send_email +import detectron.datasets.cityscapes_json_dataset_evaluator \ + as cs_json_dataset_evaluator +import detectron.datasets.json_dataset_evaluator as json_dataset_evaluator +import detectron.datasets.voc_dataset_evaluator as voc_dataset_evaluator + +logger = logging.getLogger(__name__) + + +def evaluate_all( + dataset, all_boxes, all_segms, all_keyps, output_dir, use_matlab=False +): + """Evaluate "all" tasks, where "all" includes box detection, instance + segmentation, and keypoint detection. 
+ """ + all_results = evaluate_boxes( + dataset, all_boxes, output_dir, use_matlab=use_matlab + ) + logger.info('Evaluating bounding boxes is done!') + if cfg.MODEL.MASK_ON: + results = evaluate_masks(dataset, all_boxes, all_segms, output_dir) + all_results[dataset.name].update(results[dataset.name]) + logger.info('Evaluating segmentations is done!') + if cfg.MODEL.KEYPOINTS_ON: + results = evaluate_keypoints(dataset, all_boxes, all_keyps, output_dir) + all_results[dataset.name].update(results[dataset.name]) + logger.info('Evaluating keypoints is done!') + return all_results + + +def evaluate_boxes(dataset, all_boxes, output_dir, use_matlab=False): + """Evaluate bounding box detection.""" + logger.info('Evaluating detections') + not_comp = not cfg.TEST.COMPETITION_MODE + if _use_json_dataset_evaluator(dataset): + coco_eval = json_dataset_evaluator.evaluate_boxes( + dataset, all_boxes, output_dir, use_salt=not_comp, cleanup=not_comp + ) + box_results = _coco_eval_to_box_results(coco_eval) + elif _use_cityscapes_evaluator(dataset): + logger.warn('Cityscapes bbox evaluated using COCO metrics/conversions') + coco_eval = json_dataset_evaluator.evaluate_boxes( + dataset, all_boxes, output_dir, use_salt=not_comp, cleanup=not_comp + ) + box_results = _coco_eval_to_box_results(coco_eval) + elif _use_voc_evaluator(dataset): + # For VOC, always use salt and always cleanup because results are + # written to the shared VOCdevkit results directory + voc_eval = voc_dataset_evaluator.evaluate_boxes( + dataset, all_boxes, output_dir, use_matlab=use_matlab + ) + box_results = _voc_eval_to_box_results(voc_eval) + else: + raise NotImplementedError( + 'No evaluator for dataset: {}'.format(dataset.name) + ) + return OrderedDict([(dataset.name, box_results)]) + + +def evaluate_masks(dataset, all_boxes, all_segms, output_dir): + """Evaluate instance segmentation.""" + logger.info('Evaluating segmentations') + not_comp = not cfg.TEST.COMPETITION_MODE + if 
_use_json_dataset_evaluator(dataset): + coco_eval = json_dataset_evaluator.evaluate_masks( + dataset, + all_boxes, + all_segms, + output_dir, + use_salt=not_comp, + cleanup=not_comp + ) + mask_results = _coco_eval_to_mask_results(coco_eval) + elif _use_cityscapes_evaluator(dataset): + cs_eval = cs_json_dataset_evaluator.evaluate_masks( + dataset, + all_boxes, + all_segms, + output_dir, + use_salt=not_comp, + cleanup=not_comp + ) + mask_results = _cs_eval_to_mask_results(cs_eval) + else: + raise NotImplementedError( + 'No evaluator for dataset: {}'.format(dataset.name) + ) + return OrderedDict([(dataset.name, mask_results)]) + + +def evaluate_keypoints(dataset, all_boxes, all_keyps, output_dir): + """Evaluate human keypoint detection (i.e., 2D pose estimation).""" + logger.info('Evaluating detections') + not_comp = not cfg.TEST.COMPETITION_MODE + assert dataset.name.startswith('keypoints_coco_'), \ + 'Only COCO keypoints are currently supported' + coco_eval = json_dataset_evaluator.evaluate_keypoints( + dataset, + all_boxes, + all_keyps, + output_dir, + use_salt=not_comp, + cleanup=not_comp + ) + keypoint_results = _coco_eval_to_keypoint_results(coco_eval) + return OrderedDict([(dataset.name, keypoint_results)]) + + +def evaluate_box_proposals(dataset, roidb): + """Evaluate bounding box object proposals.""" + res = _empty_box_proposal_results() + areas = {'all': '', 'small': 's', 'medium': 'm', 'large': 'l'} + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = json_dataset_evaluator.evaluate_box_proposals( + dataset, + roidb, + area=area, + limit=limit, + class_specific=cfg.TEST.CLASS_SPECIFIC_AR + ) + key = 'AR{}@{:d}'.format(suffix, limit) + res['box_proposal'][key] = stats['ar'] + return OrderedDict([(dataset.name, res)]) + + +def log_box_proposal_results(results): + """Log bounding box proposal results.""" + for dataset in results.keys(): + keys = results[dataset]['box_proposal'].keys() + pad = max([len(k) for k in keys]) + 
logger.info(dataset) + for k, v in results[dataset]['box_proposal'].items(): + logger.info('{}: {:.3f}'.format(k.ljust(pad), v)) + + +def log_copy_paste_friendly_results(results): + """Log results in a format that makes it easy to copy-and-paste in a + spreadsheet. Lines are prefixed with 'copypaste: ' to make grepping easy. + """ + for dataset in results.keys(): + logger.info('copypaste: Dataset: {}'.format(dataset)) + for task, metrics in results[dataset].items(): + logger.info('copypaste: Task: {}'.format(task)) + metric_names = metrics.keys() + metric_vals = ['{:.4f}'.format(v) for v in metrics.values()] + logger.info('copypaste: ' + ','.join(metric_names)) + logger.info('copypaste: ' + ','.join(metric_vals)) + + +def check_expected_results(results, atol=0.005, rtol=0.1): + """Check actual results against expected results stored in + cfg.EXPECTED_RESULTS. Optionally email if the match exceeds the specified + tolerance. + + Expected results should take the form of a list of expectations, each + specified by four elements: [dataset, task, metric, expected value]. For + example: [['coco_2014_minival', 'box_proposal', 'AR@1000', 0.387], ...]. + + The expected value may also be formatted as a list [mean, std] providing + an empirical mean and standard deviation from which a valid range is computed + using cfg.EXPECTED_RESULTS_SIGMA_TOL. For example: + [['coco_2014_minival', 'box_proposal', 'AR@1000', [0.387, 0.001]], ...] 
+ """ + # cfg contains a reference set of results that we want to check against + if len(cfg.EXPECTED_RESULTS) == 0: + return + + for dataset, task, metric, expected_val in cfg.EXPECTED_RESULTS: + assert dataset in results, 'Dataset {} not in results'.format(dataset) + assert task in results[dataset], 'Task {} not in results'.format(task) + assert metric in results[dataset][task], \ + 'Metric {} not in results'.format(metric) + actual_val = results[dataset][task][metric] + ok = False + if isinstance(expected_val, list): + assert len(expected_val) == 2, ( + 'Expected result must be in (mean, std) format' + ) + mean, std = expected_val + lo = mean - cfg.EXPECTED_RESULTS_SIGMA_TOL * std + hi = mean + cfg.EXPECTED_RESULTS_SIGMA_TOL * std + ok = (lo < actual_val) and (actual_val < hi) + msg = ( + '{} > {} > {} sanity check (actual vs. expected): ' + '{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})' + ).format(dataset, task, metric, actual_val, mean, std, lo, hi) + else: + err = abs(actual_val - expected_val) + tol = atol + rtol * abs(expected_val) + ok = (err > tol) + msg = ( + '{} > {} > {} sanity check (actual vs. expected): ' + '{:.3f} vs. 
{:.3f}, err={:.3f}, tol={:.3f}' + ).format(dataset, task, metric, actual_val, expected_val, err, tol) + if not ok: + msg = 'FAIL: ' + msg + logger.error(msg) + if cfg.EXPECTED_RESULTS_EMAIL != '': + subject = 'Detectron end-to-end test failure' + job_name = os.environ[ + 'DETECTRON_JOB_NAME' + ] if 'DETECTRON_JOB_NAME' in os.environ else '' + job_id = os.environ[ + 'WORKFLOW_RUN_ID' + ] if 'WORKFLOW_RUN_ID' in os.environ else '' + body = [ + 'Name:', + job_name, + 'Run ID:', + job_id, + 'Failure:', + msg, + 'Config:', + pprint.pformat(cfg), + 'Env:', + pprint.pformat(dict(os.environ)), + ] + send_email( + subject, '\n\n'.join(body), cfg.EXPECTED_RESULTS_EMAIL + ) + else: + msg = 'PASS: ' + msg + logger.info(msg) + + +def _use_json_dataset_evaluator(dataset): + """Check if the dataset uses the general json dataset evaluator.""" + return dataset.name.find('coco_') > -1 or cfg.TEST.FORCE_JSON_DATASET_EVAL + + +def _use_cityscapes_evaluator(dataset): + """Check if the dataset uses the Cityscapes dataset evaluator.""" + return dataset.name.find('cityscapes_') > -1 + + +def _use_voc_evaluator(dataset): + """Check if the dataset uses the PASCAL VOC dataset evaluator.""" + return dataset.name[:4] == 'voc_' + + +# Indices in the stats array for COCO boxes and masks +COCO_AP = 0 +COCO_AP50 = 1 +COCO_AP75 = 2 +COCO_APS = 3 +COCO_APM = 4 +COCO_APL = 5 +# Slight difference for keypoints +COCO_KPS_APM = 3 +COCO_KPS_APL = 4 + + +# ---------------------------------------------------------------------------- # +# Helper functions for producing properly formatted results. 
+# ---------------------------------------------------------------------------- # + +def _coco_eval_to_box_results(coco_eval): + res = _empty_box_results() + if coco_eval is not None: + s = coco_eval.stats + res['box']['AP'] = s[COCO_AP] + res['box']['AP50'] = s[COCO_AP50] + res['box']['AP75'] = s[COCO_AP75] + res['box']['APs'] = s[COCO_APS] + res['box']['APm'] = s[COCO_APM] + res['box']['APl'] = s[COCO_APL] + return res + + +def _coco_eval_to_mask_results(coco_eval): + res = _empty_mask_results() + if coco_eval is not None: + s = coco_eval.stats + res['mask']['AP'] = s[COCO_AP] + res['mask']['AP50'] = s[COCO_AP50] + res['mask']['AP75'] = s[COCO_AP75] + res['mask']['APs'] = s[COCO_APS] + res['mask']['APm'] = s[COCO_APM] + res['mask']['APl'] = s[COCO_APL] + return res + + +def _coco_eval_to_keypoint_results(coco_eval): + res = _empty_keypoint_results() + if coco_eval is not None: + s = coco_eval.stats + res['keypoint']['AP'] = s[COCO_AP] + res['keypoint']['AP50'] = s[COCO_AP50] + res['keypoint']['AP75'] = s[COCO_AP75] + res['keypoint']['APm'] = s[COCO_KPS_APM] + res['keypoint']['APl'] = s[COCO_KPS_APL] + return res + + +def _voc_eval_to_box_results(voc_eval): + # Not supported (return empty results) + return _empty_box_results() + + +def _cs_eval_to_mask_results(cs_eval): + # Not supported (return empty results) + return _empty_mask_results() + + +def _empty_box_results(): + return OrderedDict({ + 'box': + OrderedDict( + [ + ('AP', -1), + ('AP50', -1), + ('AP75', -1), + ('APs', -1), + ('APm', -1), + ('APl', -1), + ] + ) + }) + + +def _empty_mask_results(): + return OrderedDict({ + 'mask': + OrderedDict( + [ + ('AP', -1), + ('AP50', -1), + ('AP75', -1), + ('APs', -1), + ('APm', -1), + ('APl', -1), + ] + ) + }) + + +def _empty_keypoint_results(): + return OrderedDict({ + 'keypoint': + OrderedDict( + [ + ('AP', -1), + ('AP50', -1), + ('AP75', -1), + ('APm', -1), + ('APl', -1), + ] + ) + }) + + +def _empty_box_proposal_results(): + return OrderedDict({ + 'box_proposal': 
+ OrderedDict( + [ + ('AR@100', -1), + ('ARs@100', -1), + ('ARm@100', -1), + ('ARl@100', -1), + ('AR@1000', -1), + ('ARs@1000', -1), + ('ARm@1000', -1), + ('ARl@1000', -1), + ] + ) + }) diff --git a/detectron/datasets/voc_dataset_evaluator.py b/detectron/datasets/voc_dataset_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..4426fc3d298f0dd597218b4179f6c4f7d3b07922 --- /dev/null +++ b/detectron/datasets/voc_dataset_evaluator.py @@ -0,0 +1,178 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""PASCAL VOC dataset evaluation interface.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np +import os +import shutil +import uuid + +from detectron.core.config import cfg +from detectron.datasets.dataset_catalog import get_devkit_dir +from detectron.datasets.voc_eval import voc_eval +from detectron.utils.io import save_object + +logger = logging.getLogger(__name__) + + +def evaluate_boxes( + json_dataset, + all_boxes, + output_dir, + use_salt=True, + cleanup=True, + use_matlab=False +): + salt = '_{}'.format(str(uuid.uuid4())) if use_salt else '' + filenames = _write_voc_results_files(json_dataset, all_boxes, salt) + _do_python_eval(json_dataset, salt, output_dir) + if use_matlab: + _do_matlab_eval(json_dataset, salt, output_dir) + if cleanup: + for filename in filenames: + shutil.copy(filename, output_dir) + os.remove(filename) + return None + + +def _write_voc_results_files(json_dataset, all_boxes, salt): + filenames = [] + image_set_path = voc_info(json_dataset)['image_set_path'] + assert os.path.exists(image_set_path), \ + 'Image set path does not exist: {}'.format(image_set_path) + with open(image_set_path, 'r') as f: + image_index = [x.strip() for x in f.readlines()] + # Sanity check that order of images in json dataset matches order in the + # image set + roidb = json_dataset.get_roidb() + for i, entry in enumerate(roidb): + index = os.path.splitext(os.path.split(entry['image'])[1])[0] + assert index == image_index[i] + for cls_ind, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + logger.info('Writing VOC results for: {}'.format(cls)) + filename = _get_voc_results_file_template(json_dataset, + salt).format(cls) + filenames.append(filename) + assert len(all_boxes[cls_ind]) == len(image_index) + 
with open(filename, 'wt') as f: + for im_ind, index in enumerate(image_index): + dets = all_boxes[cls_ind][im_ind] + if type(dets) == list: + assert len(dets) == 0, \ + 'dets should be numpy.ndarray or empty list' + continue + # the VOCdevkit expects 1-based indices + for k in range(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. + format(index, dets[k, -1], + dets[k, 0] + 1, dets[k, 1] + 1, + dets[k, 2] + 1, dets[k, 3] + 1)) + return filenames + + +def _get_voc_results_file_template(json_dataset, salt): + info = voc_info(json_dataset) + year = info['year'] + image_set = info['image_set'] + devkit_path = info['devkit_path'] + # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt + filename = 'comp4' + salt + '_det_' + image_set + '_{:s}.txt' + return os.path.join(devkit_path, 'results', 'VOC' + year, 'Main', filename) + + +def _do_python_eval(json_dataset, salt, output_dir='output'): + info = voc_info(json_dataset) + year = info['year'] + anno_path = info['anno_path'] + image_set_path = info['image_set_path'] + devkit_path = info['devkit_path'] + cachedir = os.path.join(devkit_path, 'annotations_cache') + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(year) < 2010 else False + logger.info('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + for _, cls in enumerate(json_dataset.classes): + if cls == '__background__': + continue + filename = _get_voc_results_file_template( + json_dataset, salt).format(cls) + rec, prec, ap = voc_eval( + filename, anno_path, image_set_path, cls, cachedir, ovthresh=0.5, + use_07_metric=use_07_metric) + aps += [ap] + logger.info('AP for {} = {:.4f}'.format(cls, ap)) + res_file = os.path.join(output_dir, cls + '_pr.pkl') + save_object({'rec': rec, 'prec': prec, 'ap': ap}, res_file) + logger.info('Mean AP = {:.4f}'.format(np.mean(aps))) + logger.info('~~~~~~~~') + logger.info('Results:') + for ap in aps: + logger.info('{:.3f}'.format(ap)) + logger.info('{:.3f}'.format(np.mean(aps))) + logger.info('~~~~~~~~') + logger.info('') + logger.info('----------------------------------------------------------') + logger.info('Results computed with the **unofficial** Python eval code.') + logger.info('Results should be very close to the official MATLAB code.') + logger.info('Use `./tools/reval.py --matlab ...` for your paper.') + logger.info('-- Thanks, The Management') + logger.info('----------------------------------------------------------') + + +def _do_matlab_eval(json_dataset, salt, output_dir='output'): + import subprocess + logger.info('-----------------------------------------------------') + logger.info('Computing results with the official MATLAB eval code.') + logger.info('-----------------------------------------------------') + info = voc_info(json_dataset) + path = os.path.join( + cfg.ROOT_DIR, 'detectron', 'datasets', 'VOCdevkit-matlab-wrapper') + cmd = 'cd {} && '.format(path) + cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) + cmd += '-r "dbstop if error; ' + cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ + .format(info['devkit_path'], 'comp4' + salt, info['image_set'], + output_dir) + logger.info('Running:\n{}'.format(cmd)) + 
subprocess.call(cmd, shell=True) + + +def voc_info(json_dataset): + year = json_dataset.name[4:8] + image_set = json_dataset.name[9:] + devkit_path = get_devkit_dir(json_dataset.name) + assert os.path.exists(devkit_path), \ + 'Devkit directory {} not found'.format(devkit_path) + anno_path = os.path.join( + devkit_path, 'VOC' + year, 'Annotations', '{:s}.xml') + image_set_path = os.path.join( + devkit_path, 'VOC' + year, 'ImageSets', 'Main', image_set + '.txt') + return dict( + year=year, + image_set=image_set, + devkit_path=devkit_path, + anno_path=anno_path, + image_set_path=image_set_path) diff --git a/detectron/datasets/voc_eval.py b/detectron/datasets/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..1497305de86d3cc800fd2fa24d4da16855f85bf7 --- /dev/null +++ b/detectron/datasets/voc_eval.py @@ -0,0 +1,222 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
##############################################################################
#
# Based on:
# --------------------------------------------------------
# Fast/er R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Bharath Hariharan
# --------------------------------------------------------

"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""

import logging
import numpy as np
import os
import xml.etree.ElementTree as ET

from detectron.utils.io import load_object
from detectron.utils.io import save_object

logger = logging.getLogger(__name__)


def parse_rec(filename):
    """Parse a PASCAL VOC xml annotation file into a list of object dicts
    with keys: name, pose, truncated, difficult, bbox ([xmin, ymin, xmax,
    ymax] in the devkit's 1-based coordinates)."""
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        obj_struct = {}
        obj_struct['name'] = obj.find('name').text
        obj_struct['pose'] = obj.find('pose').text
        obj_struct['truncated'] = int(obj.find('truncated').text)
        obj_struct['difficult'] = int(obj.find('difficult').text)
        bbox = obj.find('bndbox')
        obj_struct['bbox'] = [int(bbox.find('xmin').text),
                              int(bbox.find('ymin').text),
                              int(bbox.find('xmax').text),
                              int(bbox.find('ymax').text)]
        objects.append(obj_struct)

    return objects


def voc_ap(rec, prec, use_07_metric=False):
    """Compute VOC AP given precision and recall. If use_07_metric is true,
    uses the VOC 07 11-point method (default:False).
    """
    if use_07_metric:
        # 11 point metric: mean of the max precision at recall >= t,
        # for t in {0, 0.1, ..., 1.0}
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    else:
        # correct AP calculation (area under the PR curve)
        # first append sentinel values at the end
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute the precision envelope
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # to calculate area under PR curve, look for points
        # where X axis (recall) changes value
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # and sum (\Delta recall) * prec
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                [ovthresh],
                                [use_07_metric])

    Top level function that does the PASCAL VOC evaluation.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    imageset = os.path.splitext(os.path.basename(imagesetfile))[0]
    cachefile = os.path.join(cachedir, imageset + '_annots.pkl')
    # read list of images
    with open(imagesetfile, 'r') as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # load annots
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                logger.info(
                    'Reading annotation for {:d}/{:d}'.format(
                        i + 1, len(imagenames)))
        # save
        logger.info('Saving cached annotations to {:s}'.format(cachefile))
        save_object(recs, cachefile)
    else:
        recs = load_object(cachefile)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # NB: np.bool was removed in NumPy 1.24; use the builtin bool
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # difficult objects are excluded from the positive count
        npos = npos + sum(~difficult)
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        lines = f.readlines()

    splitlines = [x.strip().split(' ') for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence; guard against the no-detections case, where BB is
    # an empty 1-D array and 2-D indexing would raise
    if BB.shape[0] > 0:
        sorted_ind = np.argsort(-confidence)
        BB = BB[sorted_ind, :]
        image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1., 0.)
            ih = np.maximum(iymax - iymin + 1., 0.)
            inters = iw * ih

            # union
            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    # first match of this gt box: true positive
                    tp[d] = 1.
                    R['det'][jmax] = 1
                else:
                    # duplicate detection of an already-matched gt box
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
# diff --git a/detectron/modeling/FPN.py b/detectron/modeling/FPN.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..46881e49708ff6b5a4f5657897a49e4ebb53052e
# --- /dev/null
# +++ b/detectron/modeling/FPN.py
# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Functions for using a Feature Pyramid Network (FPN).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import collections +import numpy as np + +from detectron.core.config import cfg +from detectron.modeling.generate_anchors import generate_anchors +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill +from detectron.utils.net import get_group_gn +import detectron.modeling.ResNet as ResNet +import detectron.utils.blob as blob_utils +import detectron.utils.boxes as box_utils + +# Lowest and highest pyramid levels in the backbone network. For FPN, we assume +# that all networks have 5 spatial reductions, each by a factor of 2. Level 1 +# would correspond to the input image, hence it does not make sense to use it. 
LOWEST_BACKBONE_LVL = 2   # E.g., "conv2"-like level
HIGHEST_BACKBONE_LVL = 5  # E.g., "conv5"-like level


# ---------------------------------------------------------------------------- #
# FPN with ResNet
# ---------------------------------------------------------------------------- #

def add_fpn_ResNet50_conv5_body(model):
    """FPN on a ResNet-50 conv5 backbone (all pyramid levels)."""
    return add_fpn_onto_conv_body(
        model, ResNet.add_ResNet50_conv5_body, fpn_level_info_ResNet50_conv5
    )


def add_fpn_ResNet50_conv5_P2only_body(model):
    """FPN on ResNet-50 conv5, returning only the finest (P2) level."""
    return add_fpn_onto_conv_body(
        model,
        ResNet.add_ResNet50_conv5_body,
        fpn_level_info_ResNet50_conv5,
        P2only=True
    )


def add_fpn_ResNet101_conv5_body(model):
    """FPN on a ResNet-101 conv5 backbone (all pyramid levels)."""
    return add_fpn_onto_conv_body(
        model, ResNet.add_ResNet101_conv5_body, fpn_level_info_ResNet101_conv5
    )


def add_fpn_ResNet101_conv5_P2only_body(model):
    """FPN on ResNet-101 conv5, returning only the finest (P2) level."""
    return add_fpn_onto_conv_body(
        model,
        ResNet.add_ResNet101_conv5_body,
        fpn_level_info_ResNet101_conv5,
        P2only=True
    )


def add_fpn_ResNet152_conv5_body(model):
    """FPN on a ResNet-152 conv5 backbone (all pyramid levels)."""
    return add_fpn_onto_conv_body(
        model, ResNet.add_ResNet152_conv5_body, fpn_level_info_ResNet152_conv5
    )


def add_fpn_ResNet152_conv5_P2only_body(model):
    """FPN on ResNet-152 conv5, returning only the finest (P2) level."""
    return add_fpn_onto_conv_body(
        model,
        ResNet.add_ResNet152_conv5_body,
        fpn_level_info_ResNet152_conv5,
        P2only=True
    )


# ---------------------------------------------------------------------------- #
# Functions for bolting FPN onto a backbone architectures
# ---------------------------------------------------------------------------- #

def add_fpn_onto_conv_body(
    model, conv_body_func, fpn_level_info_func, P2only=False
):
    """Add the specified conv body to the model and then add FPN levels to it.
    """
    # Note: blobs_conv is in reversed order: [fpn5, fpn4, fpn3, fpn2]
    # similarly for dims_conv: [2048, 1024, 512, 256]
    # similarly for spatial_scales_fpn: [1/32, 1/16, 1/8, 1/4]

    conv_body_func(model)
    blobs_fpn, dim_fpn, spatial_scales_fpn = add_fpn(
        model, fpn_level_info_func()
    )

    if P2only:
        # use only the finest level
        return blobs_fpn[-1], dim_fpn, spatial_scales_fpn[-1]
    else:
        # use all levels
        return blobs_fpn, dim_fpn, spatial_scales_fpn


def add_fpn(model, fpn_level_info):
    """Add FPN connections based on the model described in the FPN paper."""
    # FPN levels are built starting from the highest/coarest level of the
    # backbone (usually "conv5"). First we build down, recursively constructing
    # lower/finer resolution FPN levels. Then we build up, constructing levels
    # that are even higher/coarser than the starting level.
    fpn_dim = cfg.FPN.DIM
    min_level, max_level = get_min_max_levels()
    # Count the number of backbone stages that we will generate FPN levels for
    # starting from the coarest backbone stage (usually the "conv5"-like level)
    # E.g., if the backbone level info defines stages 4 stages: "conv5",
    # "conv4", ... "conv2" and min_level=2, then we end up with 4 - (2 - 2) = 4
    # backbone stages to add FPN to.
    num_backbone_stages = (
        len(fpn_level_info.blobs) - (min_level - LOWEST_BACKBONE_LVL)
    )

    lateral_input_blobs = fpn_level_info.blobs[:num_backbone_stages]
    output_blobs = [
        'fpn_inner_{}'.format(s)
        for s in fpn_level_info.blobs[:num_backbone_stages]
    ]
    fpn_dim_lateral = fpn_level_info.dims
    xavier_fill = ('XavierFill', {})

    # For the coarsest backbone level: 1x1 conv only seeds recursion
    if cfg.FPN.USE_GN:
        # use GroupNorm
        c = model.ConvGN(
            lateral_input_blobs[0],
            output_blobs[0],  # note: this is a prefix
            dim_in=fpn_dim_lateral[0],
            dim_out=fpn_dim,
            group_gn=get_group_gn(fpn_dim),
            kernel=1,
            pad=0,
            stride=1,
            weight_init=xavier_fill,
            bias_init=const_fill(0.0)
        )
        output_blobs[0] = c  # rename it
    else:
        model.Conv(
            lateral_input_blobs[0],
            output_blobs[0],
            dim_in=fpn_dim_lateral[0],
            dim_out=fpn_dim,
            kernel=1,
            pad=0,
            stride=1,
            weight_init=xavier_fill,
            bias_init=const_fill(0.0)
        )

    #
    # Step 1: recursively build down starting from the coarsest backbone level
    #

    # For other levels add top-down and lateral connections
    for i in range(num_backbone_stages - 1):
        add_topdown_lateral_module(
            model,
            output_blobs[i],             # top-down blob
            lateral_input_blobs[i + 1],  # lateral blob
            output_blobs[i + 1],         # next output blob
            fpn_dim,                     # output dimension
            fpn_dim_lateral[i + 1]       # lateral input dimension
        )

    # Post-hoc scale-specific 3x3 convs
    blobs_fpn = []
    spatial_scales = []
    for i in range(num_backbone_stages):
        if cfg.FPN.USE_GN:
            # use GroupNorm
            fpn_blob = model.ConvGN(
                output_blobs[i],
                'fpn_{}'.format(fpn_level_info.blobs[i]),
                dim_in=fpn_dim,
                dim_out=fpn_dim,
                group_gn=get_group_gn(fpn_dim),
                kernel=3,
                pad=1,
                stride=1,
                weight_init=xavier_fill,
                bias_init=const_fill(0.0)
            )
        else:
            fpn_blob = model.Conv(
                output_blobs[i],
                'fpn_{}'.format(fpn_level_info.blobs[i]),
                dim_in=fpn_dim,
                dim_out=fpn_dim,
                kernel=3,
                pad=1,
                stride=1,
                weight_init=xavier_fill,
                bias_init=const_fill(0.0)
            )
        blobs_fpn += [fpn_blob]
        spatial_scales += [fpn_level_info.spatial_scales[i]]

    #
    # Step 2: build up starting from the coarsest backbone level
    #

    # Check if we need the P6 feature map
    if not cfg.FPN.EXTRA_CONV_LEVELS and max_level == HIGHEST_BACKBONE_LVL + 1:
        # Original FPN P6 level implementation from our CVPR'17 FPN paper
        P6_blob_in = blobs_fpn[0]
        P6_name = P6_blob_in + '_subsampled_2x'
        # Use max pooling to simulate stride 2 subsampling
        P6_blob = model.MaxPool(P6_blob_in, P6_name, kernel=1, pad=0, stride=2)
        blobs_fpn.insert(0, P6_blob)
        spatial_scales.insert(0, spatial_scales[0] * 0.5)

    # Coarser FPN levels introduced for RetinaNet
    if cfg.FPN.EXTRA_CONV_LEVELS and max_level > HIGHEST_BACKBONE_LVL:
        fpn_blob = fpn_level_info.blobs[0]
        dim_in = fpn_level_info.dims[0]
        for i in range(HIGHEST_BACKBONE_LVL + 1, max_level + 1):
            fpn_blob_in = fpn_blob
            if i > HIGHEST_BACKBONE_LVL + 1:
                # ReLU between the extra stride-2 convs (but not before P6)
                fpn_blob_in = model.Relu(fpn_blob, fpn_blob + '_relu')
            fpn_blob = model.Conv(
                fpn_blob_in,
                'fpn_' + str(i),
                dim_in=dim_in,
                dim_out=fpn_dim,
                kernel=3,
                pad=1,
                stride=2,
                weight_init=xavier_fill,
                bias_init=const_fill(0.0)
            )
            dim_in = fpn_dim
            blobs_fpn.insert(0, fpn_blob)
            spatial_scales.insert(0, spatial_scales[0] * 0.5)

    return blobs_fpn, fpn_dim, spatial_scales


def add_topdown_lateral_module(
    model, fpn_top, fpn_lateral, fpn_bottom, dim_top, dim_lateral
):
    """Add a top-down lateral module (1x1 lateral conv + 2x nearest-neighbor
    upsampled top-down blob, summed into fpn_bottom)."""
    # Lateral 1x1 conv
    if cfg.FPN.USE_GN:
        # use GroupNorm
        lat = model.ConvGN(
            fpn_lateral,
            fpn_bottom + '_lateral',
            dim_in=dim_lateral,
            dim_out=dim_top,
            group_gn=get_group_gn(dim_top),
            kernel=1,
            pad=0,
            stride=1,
            weight_init=(
                const_fill(0.0) if cfg.FPN.ZERO_INIT_LATERAL
                else ('XavierFill', {})),
            bias_init=const_fill(0.0)
        )
    else:
        lat = model.Conv(
            fpn_lateral,
            fpn_bottom + '_lateral',
            dim_in=dim_lateral,
            dim_out=dim_top,
            kernel=1,
            pad=0,
            stride=1,
            weight_init=(
                const_fill(0.0)
                if cfg.FPN.ZERO_INIT_LATERAL else ('XavierFill', {})
            ),
            bias_init=const_fill(0.0)
        )
    # Top-down 2x upsampling
    td = model.net.UpsampleNearest(fpn_top, fpn_bottom + '_topdown', scale=2)
    # Sum lateral and top-down
    model.net.Sum([lat, td], fpn_bottom)


def get_min_max_levels():
    """The min and max FPN levels required for supporting RPN and/or RoI
    transform operations on multiple FPN levels.
    """
    min_level = LOWEST_BACKBONE_LVL
    max_level = HIGHEST_BACKBONE_LVL
    if cfg.FPN.MULTILEVEL_RPN and not cfg.FPN.MULTILEVEL_ROIS:
        max_level = cfg.FPN.RPN_MAX_LEVEL
        min_level = cfg.FPN.RPN_MIN_LEVEL
    if not cfg.FPN.MULTILEVEL_RPN and cfg.FPN.MULTILEVEL_ROIS:
        max_level = cfg.FPN.ROI_MAX_LEVEL
        min_level = cfg.FPN.ROI_MIN_LEVEL
    if cfg.FPN.MULTILEVEL_RPN and cfg.FPN.MULTILEVEL_ROIS:
        # need the union of both ranges
        max_level = max(cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.ROI_MAX_LEVEL)
        min_level = min(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.ROI_MIN_LEVEL)
    return min_level, max_level


# ---------------------------------------------------------------------------- #
# RPN with an FPN backbone
# ---------------------------------------------------------------------------- #

def add_fpn_rpn_outputs(model, blobs_in, dim_in, spatial_scales):
    """Add RPN on FPN specific outputs."""
    num_anchors = len(cfg.FPN.RPN_ASPECT_RATIOS)
    dim_out = dim_in

    k_max = cfg.FPN.RPN_MAX_LEVEL  # coarsest level of pyramid
    k_min = cfg.FPN.RPN_MIN_LEVEL  # finest level of pyramid
    assert len(blobs_in) == k_max - k_min + 1
    for lvl in range(k_min, k_max + 1):
        bl_in = blobs_in[k_max - lvl]  # blobs_in is in reversed order
        sc = spatial_scales[k_max - lvl]  # in reversed order
        slvl = str(lvl)

        if lvl == k_min:
            # Create conv ops with randomly initialized weights and
            # zeroed biases for the first FPN level; these will be shared by
            # all other FPN levels
            # RPN hidden representation
            conv_rpn_fpn = model.Conv(
                bl_in,
                'conv_rpn_fpn' + slvl,
                dim_in,
                dim_out,
                kernel=3,
                pad=1,
                stride=1,
                weight_init=gauss_fill(0.01),
                bias_init=const_fill(0.0)
            )
            model.Relu(conv_rpn_fpn, conv_rpn_fpn)
            # Proposal classification scores
            rpn_cls_logits_fpn = model.Conv(
                conv_rpn_fpn,
                'rpn_cls_logits_fpn' + slvl,
                dim_in,
                num_anchors,
                kernel=1,
                pad=0,
                stride=1,
                weight_init=gauss_fill(0.01),
                bias_init=const_fill(0.0)
            )
            # Proposal bbox regression deltas
            rpn_bbox_pred_fpn = model.Conv(
                conv_rpn_fpn,
                'rpn_bbox_pred_fpn' + slvl,
                dim_in,
                4 * num_anchors,
                kernel=1,
                pad=0,
                stride=1,
                weight_init=gauss_fill(0.01),
                bias_init=const_fill(0.0)
            )
        else:
            # Share weights and biases
            sk_min = str(k_min)
            # RPN hidden representation
            conv_rpn_fpn = model.ConvShared(
                bl_in,
                'conv_rpn_fpn' + slvl,
                dim_in,
                dim_out,
                kernel=3,
                pad=1,
                stride=1,
                weight='conv_rpn_fpn' + sk_min + '_w',
                bias='conv_rpn_fpn' + sk_min + '_b'
            )
            model.Relu(conv_rpn_fpn, conv_rpn_fpn)
            # Proposal classification scores
            rpn_cls_logits_fpn = model.ConvShared(
                conv_rpn_fpn,
                'rpn_cls_logits_fpn' + slvl,
                dim_in,
                num_anchors,
                kernel=1,
                pad=0,
                stride=1,
                weight='rpn_cls_logits_fpn' + sk_min + '_w',
                bias='rpn_cls_logits_fpn' + sk_min + '_b'
            )
            # Proposal bbox regression deltas
            rpn_bbox_pred_fpn = model.ConvShared(
                conv_rpn_fpn,
                'rpn_bbox_pred_fpn' + slvl,
                dim_in,
                4 * num_anchors,
                kernel=1,
                pad=0,
                stride=1,
                weight='rpn_bbox_pred_fpn' + sk_min + '_w',
                bias='rpn_bbox_pred_fpn' + sk_min + '_b'
            )

        if not model.train or cfg.MODEL.FASTER_RCNN:
            # Proposals are needed during:
            # 1) inference (== not model.train) for RPN only and Faster R-CNN
            # OR
            # 2) training for Faster R-CNN
            # Otherwise (== training for RPN only), proposals are not needed
            lvl_anchors = generate_anchors(
                stride=2.**lvl,
                sizes=(cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), ),
                aspect_ratios=cfg.FPN.RPN_ASPECT_RATIOS
            )
            rpn_cls_probs_fpn = model.net.Sigmoid(
                rpn_cls_logits_fpn, 'rpn_cls_probs_fpn' + slvl
            )
            model.GenerateProposals(
                [rpn_cls_probs_fpn, rpn_bbox_pred_fpn, 'im_info'],
                ['rpn_rois_fpn' + slvl, 'rpn_roi_probs_fpn' + slvl],
                anchors=lvl_anchors,
                spatial_scale=sc
            )


def add_fpn_rpn_losses(model):
    """Add RPN on FPN specific losses."""
    loss_gradients = {}
    for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1):
        slvl = str(lvl)
        # Spatially narrow the full-sized RPN label arrays to match the
        # feature map shape
        model.net.SpatialNarrowAs(
            ['rpn_labels_int32_wide_fpn' + slvl, 'rpn_cls_logits_fpn' + slvl],
            'rpn_labels_int32_fpn' + slvl
        )
        for key in ('targets', 'inside_weights', 'outside_weights'):
            model.net.SpatialNarrowAs(
                [
                    'rpn_bbox_' + key + '_wide_fpn' + slvl,
                    'rpn_bbox_pred_fpn' + slvl
                ],
                'rpn_bbox_' + key + '_fpn' + slvl
            )
        loss_rpn_cls_fpn = model.net.SigmoidCrossEntropyLoss(
            ['rpn_cls_logits_fpn' + slvl, 'rpn_labels_int32_fpn' + slvl],
            'loss_rpn_cls_fpn' + slvl,
            normalize=0,
            scale=(
                model.GetLossScale() / cfg.TRAIN.RPN_BATCH_SIZE_PER_IM /
                cfg.TRAIN.IMS_PER_BATCH
            )
        )
        # Normalization by (1) RPN_BATCH_SIZE_PER_IM and (2) IMS_PER_BATCH is
        # handled by (1) setting bbox outside weights and (2) SmoothL1Loss
        # normalizes by IMS_PER_BATCH
        loss_rpn_bbox_fpn = model.net.SmoothL1Loss(
            [
                'rpn_bbox_pred_fpn' + slvl, 'rpn_bbox_targets_fpn' + slvl,
                'rpn_bbox_inside_weights_fpn' + slvl,
                'rpn_bbox_outside_weights_fpn' + slvl
            ],
            'loss_rpn_bbox_fpn' + slvl,
            beta=1. / 9.,
            scale=model.GetLossScale(),
        )
        loss_gradients.update(
            blob_utils.
            get_loss_gradients(model, [loss_rpn_cls_fpn, loss_rpn_bbox_fpn])
        )
        model.AddLosses(['loss_rpn_cls_fpn' + slvl, 'loss_rpn_bbox_fpn' + slvl])
    return loss_gradients


# ---------------------------------------------------------------------------- #
# Helper functions for working with multilevel FPN RoIs
# ---------------------------------------------------------------------------- #

def map_rois_to_fpn_levels(rois, k_min, k_max):
    """Determine which FPN level each RoI in a set of RoIs should map to based
    on the heuristic in the FPN paper.
    """
    # Compute level ids
    s = np.sqrt(box_utils.boxes_area(rois))
    s0 = cfg.FPN.ROI_CANONICAL_SCALE  # default: 224
    lvl0 = cfg.FPN.ROI_CANONICAL_LEVEL  # default: 4

    # Eqn.(1) in FPN paper
    target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
    target_lvls = np.clip(target_lvls, k_min, k_max)
    return target_lvls


def add_multilevel_roi_blobs(
    blobs, blob_prefix, rois, target_lvls, lvl_min, lvl_max
):
    """Add RoI blobs for multiple FPN levels to the blobs dict.

    blobs: a dict mapping from blob name to numpy ndarray
    blob_prefix: name prefix to use for the FPN blobs
    rois: the source rois as a 2D numpy array of shape (N, 5) where each row is
        an roi and the columns encode (batch_idx, x1, y1, x2, y2)
    target_lvls: numpy array of shape (N, ) indicating which FPN level each roi
        in rois should be assigned to
    lvl_min: the finest (highest resolution) FPN level (e.g., 2)
    lvl_max: the coarest (lowest resolution) FPN level (e.g., 6)
    """
    rois_idx_order = np.empty((0, ))
    rois_stacked = np.zeros((0, 5), dtype=np.float32)  # for assert
    for lvl in range(lvl_min, lvl_max + 1):
        idx_lvl = np.where(target_lvls == lvl)[0]
        blobs[blob_prefix + '_fpn' + str(lvl)] = rois[idx_lvl, :]
        rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
        rois_stacked = np.vstack(
            [rois_stacked, blobs[blob_prefix + '_fpn' + str(lvl)]]
        )
    rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False)
    blobs[blob_prefix + '_idx_restore_int32'] = rois_idx_restore
    # Sanity check that restore order is correct
    assert (rois_stacked[rois_idx_restore] == rois).all()


# ---------------------------------------------------------------------------- #
# FPN level info for stages 5, 4, 3, 2 for select models (more can be added)
# ---------------------------------------------------------------------------- #

FpnLevelInfo = collections.namedtuple(
    'FpnLevelInfo',
    ['blobs', 'dims', 'spatial_scales']
)


def fpn_level_info_ResNet50_conv5():
    # Blob names/dims/scales listed coarsest (conv5) to finest (conv2)
    return FpnLevelInfo(
        blobs=('res5_2_sum', 'res4_5_sum', 'res3_3_sum', 'res2_2_sum'),
        dims=(2048, 1024, 512, 256),
        spatial_scales=(1. / 32., 1. / 16., 1. / 8., 1. / 4.)
    )


def fpn_level_info_ResNet101_conv5():
    # Blob names/dims/scales listed coarsest (conv5) to finest (conv2)
    return FpnLevelInfo(
        blobs=('res5_2_sum', 'res4_22_sum', 'res3_3_sum', 'res2_2_sum'),
        dims=(2048, 1024, 512, 256),
        spatial_scales=(1. / 32., 1. / 16., 1. / 8., 1. / 4.)
    )


def fpn_level_info_ResNet152_conv5():
    # Blob names/dims/scales listed coarsest (conv5) to finest (conv2)
    return FpnLevelInfo(
        blobs=('res5_2_sum', 'res4_35_sum', 'res3_7_sum', 'res2_2_sum'),
        dims=(2048, 1024, 512, 256),
        spatial_scales=(1. / 32., 1. / 16., 1. / 8., 1. / 4.)
    )
# diff --git a/detectron/modeling/ResNet.py b/detectron/modeling/ResNet.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..ae602d97c92ef9d1d8584b2e9742c585b4502d8c
# --- /dev/null
# +++ b/detectron/modeling/ResNet.py
# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

"""Implements ResNet and ResNeXt.

See: https://arxiv.org/abs/1512.03385, https://arxiv.org/abs/1611.05431.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from detectron.core.config import cfg
from detectron.utils.net import get_group_gn


# ---------------------------------------------------------------------------- #
# Bits for specific architectures (ResNet50, ResNet101, ...)
# ---------------------------------------------------------------------------- #


def add_ResNet50_conv4_body(model):
    # Stages res2-res4 only (e.g., for C4-style Faster R-CNN heads)
    return add_ResNet_convX_body(model, (3, 4, 6))


def add_ResNet50_conv5_body(model):
    return add_ResNet_convX_body(model, (3, 4, 6, 3))


def add_ResNet101_conv4_body(model):
    return add_ResNet_convX_body(model, (3, 4, 23))


def add_ResNet101_conv5_body(model):
    return add_ResNet_convX_body(model, (3, 4, 23, 3))


def add_ResNet152_conv5_body(model):
    return add_ResNet_convX_body(model, (3, 8, 36, 3))


# ---------------------------------------------------------------------------- #
# Generic ResNet components
# ---------------------------------------------------------------------------- #


def add_stage(
    model,
    prefix,
    blob_in,
    n,
    dim_in,
    dim_out,
    dim_inner,
    dilation,
    stride_init=2
):
    """Add a ResNet stage to the model by stacking n residual blocks.

    prefix: stage name prefix used for blob naming (e.g., 'res2')
    n: number of residual blocks to stack
    dim_in/dim_out: input/output channel counts for the stage
    dim_inner: bottleneck channel count
    Returns (final output blob, output dim).
    """
    # e.g., prefix = res2
    for i in range(n):
        blob_in = add_residual_block(
            model,
            '{}_{}'.format(prefix, i),
            blob_in,
            dim_in,
            dim_out,
            dim_inner,
            dilation,
            stride_init,
            # Not using inplace for the last block;
            # it may be fetched externally or used by FPN
            inplace_sum=i < n - 1
        )
        # After the first block the stage operates at dim_out channels
        dim_in = dim_out
    return blob_in, dim_in


def add_ResNet_convX_body(model, block_counts):
    """Add a ResNet body from input data up through the res5 (aka conv5) stage.
    The final res5/conv5 stage may be optionally excluded (hence convX, where
    X = 4 or 5).

    block_counts: per-stage block counts, e.g. (3, 4, 6, 3) for ResNet-50;
        a 3-tuple stops after res4.
    Returns (output blob, output dim, spatial scale of the output).
    """
    freeze_at = cfg.TRAIN.FREEZE_AT
    assert freeze_at in [0, 2, 3, 4, 5]

    # add the stem (by default, conv1 and pool1 with bn; can support gn)
    p, dim_in = globals()[cfg.RESNETS.STEM_FUNC](model, 'data')

    dim_bottleneck = cfg.RESNETS.NUM_GROUPS * cfg.RESNETS.WIDTH_PER_GROUP
    (n1, n2, n3) = block_counts[:3]
    s, dim_in = add_stage(model, 'res2', p, n1, dim_in, 256, dim_bottleneck, 1)
    if freeze_at == 2:
        # StopGradient freezes everything up to and including this stage
        model.StopGradient(s, s)
    s, dim_in = add_stage(
        model, 'res3', s, n2, dim_in, 512, dim_bottleneck * 2, 1
    )
    if freeze_at == 3:
        model.StopGradient(s, s)
    s, dim_in = add_stage(
        model, 'res4', s, n3, dim_in, 1024, dim_bottleneck * 4, 1
    )
    if freeze_at == 4:
        model.StopGradient(s, s)
    if len(block_counts) == 4:
        n4 = block_counts[3]
        s, dim_in = add_stage(
            model, 'res5', s, n4, dim_in, 2048, dim_bottleneck * 8,
            cfg.RESNETS.RES5_DILATION
        )
        if freeze_at == 5:
            model.StopGradient(s, s)
        # Dilation in res5 preserves spatial resolution proportionally
        return s, dim_in, 1. / 32. * cfg.RESNETS.RES5_DILATION
    else:
        return s, dim_in, 1. / 16.
+ + +def add_ResNet_roi_conv5_head(model, blob_in, dim_in, spatial_scale): + """Adds an RoI feature transformation (e.g., RoI pooling) followed by a + res5/conv5 head applied to each RoI.""" + # TODO(rbg): This contains Fast R-CNN specific config options making it non- + # reusable; make this more generic with model-specific wrappers + model.RoIFeatureTransform( + blob_in, + 'pool5', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=cfg.FAST_RCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + dim_bottleneck = cfg.RESNETS.NUM_GROUPS * cfg.RESNETS.WIDTH_PER_GROUP + stride_init = int(cfg.FAST_RCNN.ROI_XFORM_RESOLUTION / 7) + s, dim_in = add_stage( + model, 'res5', 'pool5', 3, dim_in, 2048, dim_bottleneck * 8, 1, + stride_init + ) + s = model.AveragePool(s, 'res5_pool', kernel=7) + return s, 2048 + + +def add_residual_block( + model, + prefix, + blob_in, + dim_in, + dim_out, + dim_inner, + dilation, + stride_init=2, + inplace_sum=False +): + """Add a residual block to the model.""" + # prefix = res_, e.g., res2_3 + + # Max pooling is performed prior to the first stage (which is uniquely + # distinguished by dim_in = 64), thus we keep stride = 1 for the first stage + stride = stride_init if ( + dim_in != dim_out and dim_in != 64 and dilation == 1 + ) else 1 + + # transformation blob + tr = globals()[cfg.RESNETS.TRANS_FUNC]( + model, + blob_in, + dim_in, + dim_out, + stride, + prefix, + dim_inner, + group=cfg.RESNETS.NUM_GROUPS, + dilation=dilation + ) + + # sum -> ReLU + # shortcut function: by default using bn; support gn + add_shortcut = globals()[cfg.RESNETS.SHORTCUT_FUNC] + sc = add_shortcut(model, prefix, blob_in, dim_in, dim_out, stride) + if inplace_sum: + s = model.net.Sum([tr, sc], tr) + else: + s = model.net.Sum([tr, sc], prefix + '_sum') + + return model.Relu(s, s) + + +# ------------------------------------------------------------------------------ +# various 
shortcuts (may expand and may consider a new helper) +# ------------------------------------------------------------------------------ + + +def basic_bn_shortcut(model, prefix, blob_in, dim_in, dim_out, stride): + """ For a pre-trained network that used BN. An AffineChannel op replaces BN + during fine-tuning. + """ + + if dim_in == dim_out: + return blob_in + + c = model.Conv( + blob_in, + prefix + '_branch1', + dim_in, + dim_out, + kernel=1, + stride=stride, + no_bias=1 + ) + return model.AffineChannel(c, prefix + '_branch1_bn', dim=dim_out) + + +def basic_gn_shortcut(model, prefix, blob_in, dim_in, dim_out, stride): + if dim_in == dim_out: + return blob_in + + # output name is prefix + '_branch1_gn' + return model.ConvGN( + blob_in, + prefix + '_branch1', + dim_in, + dim_out, + kernel=1, + group_gn=get_group_gn(dim_out), + stride=stride, + pad=0, + group=1, + ) + + +# ------------------------------------------------------------------------------ +# various stems (may expand and may consider a new helper) +# ------------------------------------------------------------------------------ + + +def basic_bn_stem(model, data, **kwargs): + """Add a basic ResNet stem. For a pre-trained network that used BN. + An AffineChannel op replaces BN during fine-tuning. 
+ """ + + dim = 64 + p = model.Conv(data, 'conv1', 3, dim, 7, pad=3, stride=2, no_bias=1) + p = model.AffineChannel(p, 'res_conv1_bn', dim=dim, inplace=True) + p = model.Relu(p, p) + p = model.MaxPool(p, 'pool1', kernel=3, pad=1, stride=2) + return p, dim + + +def basic_gn_stem(model, data, **kwargs): + """Add a basic ResNet stem (using GN)""" + + dim = 64 + p = model.ConvGN( + data, 'conv1', 3, dim, 7, group_gn=get_group_gn(dim), pad=3, stride=2 + ) + p = model.Relu(p, p) + p = model.MaxPool(p, 'pool1', kernel=3, pad=1, stride=2) + return p, dim + + +# ------------------------------------------------------------------------------ +# various transformations (may expand and may consider a new helper) +# ------------------------------------------------------------------------------ + + +def bottleneck_transformation( + model, + blob_in, + dim_in, + dim_out, + stride, + prefix, + dim_inner, + dilation=1, + group=1 +): + """Add a bottleneck transformation to the model.""" + # In original resnet, stride=2 is on 1x1. + # In fb.torch resnet, stride=2 is on 3x3. 
+ (str1x1, str3x3) = (stride, 1) if cfg.RESNETS.STRIDE_1X1 else (1, stride) + + # conv 1x1 -> BN -> ReLU + cur = model.ConvAffine( + blob_in, + prefix + '_branch2a', + dim_in, + dim_inner, + kernel=1, + stride=str1x1, + pad=0, + inplace=True + ) + cur = model.Relu(cur, cur) + + # conv 3x3 -> BN -> ReLU + cur = model.ConvAffine( + cur, + prefix + '_branch2b', + dim_inner, + dim_inner, + kernel=3, + stride=str3x3, + pad=1 * dilation, + dilation=dilation, + group=group, + inplace=True + ) + cur = model.Relu(cur, cur) + + # conv 1x1 -> BN (no ReLU) + # NB: for now this AffineChannel op cannot be in-place due to a bug in C2 + # gradient computation for graphs like this + cur = model.ConvAffine( + cur, + prefix + '_branch2c', + dim_inner, + dim_out, + kernel=1, + stride=1, + pad=0, + inplace=False + ) + return cur + + +def bottleneck_gn_transformation( + model, + blob_in, + dim_in, + dim_out, + stride, + prefix, + dim_inner, + dilation=1, + group=1 +): + """Add a bottleneck transformation with GroupNorm to the model.""" + # In original resnet, stride=2 is on 1x1. + # In fb.torch resnet, stride=2 is on 3x3. 
+ (str1x1, str3x3) = (stride, 1) if cfg.RESNETS.STRIDE_1X1 else (1, stride) + + # conv 1x1 -> GN -> ReLU + cur = model.ConvGN( + blob_in, + prefix + '_branch2a', + dim_in, + dim_inner, + kernel=1, + group_gn=get_group_gn(dim_inner), + stride=str1x1, + pad=0, + ) + cur = model.Relu(cur, cur) + + # conv 3x3 -> GN -> ReLU + cur = model.ConvGN( + cur, + prefix + '_branch2b', + dim_inner, + dim_inner, + kernel=3, + group_gn=get_group_gn(dim_inner), + stride=str3x3, + pad=1 * dilation, + dilation=dilation, + group=group, + ) + cur = model.Relu(cur, cur) + + # conv 1x1 -> GN (no ReLU) + cur = model.ConvGN( + cur, + prefix + '_branch2c', + dim_inner, + dim_out, + kernel=1, + group_gn=get_group_gn(dim_out), + stride=1, + pad=0, + ) + return cur diff --git a/detectron/modeling/VGG16.py b/detectron/modeling/VGG16.py new file mode 100644 index 0000000000000000000000000000000000000000..36454cd6be43631ad232d2b41bbe434dd70bd99d --- /dev/null +++ b/detectron/modeling/VGG16.py @@ -0,0 +1,75 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""VGG16 from https://arxiv.org/abs/1409.1556.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg + + +def add_VGG16_conv5_body(model): + model.Conv('data', 'conv1_1', 3, 64, 3, pad=1, stride=1) + model.Relu('conv1_1', 'conv1_1') + model.Conv('conv1_1', 'conv1_2', 64, 64, 3, pad=1, stride=1) + model.Relu('conv1_2', 'conv1_2') + model.MaxPool('conv1_2', 'pool1', kernel=2, pad=0, stride=2) + model.Conv('pool1', 'conv2_1', 64, 128, 3, pad=1, stride=1) + model.Relu('conv2_1', 'conv2_1') + model.Conv('conv2_1', 'conv2_2', 128, 128, 3, pad=1, stride=1) + model.Relu('conv2_2', 'conv2_2') + model.MaxPool('conv2_2', 'pool2', kernel=2, pad=0, stride=2) + model.StopGradient('pool2', 'pool2') + model.Conv('pool2', 'conv3_1', 128, 256, 3, pad=1, stride=1) + model.Relu('conv3_1', 'conv3_1') + model.Conv('conv3_1', 'conv3_2', 256, 256, 3, pad=1, stride=1) + model.Relu('conv3_2', 'conv3_2') + model.Conv('conv3_2', 'conv3_3', 256, 256, 3, pad=1, stride=1) + model.Relu('conv3_3', 'conv3_3') + model.MaxPool('conv3_3', 'pool3', kernel=2, pad=0, stride=2) + model.Conv('pool3', 'conv4_1', 256, 512, 3, pad=1, stride=1) + model.Relu('conv4_1', 'conv4_1') + model.Conv('conv4_1', 'conv4_2', 512, 512, 3, pad=1, stride=1) + model.Relu('conv4_2', 'conv4_2') + model.Conv('conv4_2', 'conv4_3', 512, 512, 3, pad=1, stride=1) + model.Relu('conv4_3', 'conv4_3') + model.MaxPool('conv4_3', 'pool4', kernel=2, pad=0, stride=2) + model.Conv('pool4', 'conv5_1', 512, 512, 3, pad=1, stride=1) + model.Relu('conv5_1', 'conv5_1') + model.Conv('conv5_1', 'conv5_2', 512, 512, 3, pad=1, stride=1) + model.Relu('conv5_2', 'conv5_2') + model.Conv('conv5_2', 'conv5_3', 512, 512, 3, pad=1, stride=1) + blob_out = model.Relu('conv5_3', 'conv5_3') + return blob_out, 512, 1. / 16. 
+ + +def add_VGG16_roi_fc_head(model, blob_in, dim_in, spatial_scale): + model.RoIFeatureTransform( + blob_in, + 'pool5', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=7, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + model.FC('pool5', 'fc6', dim_in * 7 * 7, 4096) + model.Relu('fc6', 'fc6') + model.FC('fc6', 'fc7', 4096, 4096) + blob_out = model.Relu('fc7', 'fc7') + return blob_out, 4096 diff --git a/detectron/modeling/VGG_CNN_M_1024.py b/detectron/modeling/VGG_CNN_M_1024.py new file mode 100644 index 0000000000000000000000000000000000000000..0dc9a9eed01e1c185b618468cf0c1b00ba019c8a --- /dev/null +++ b/detectron/modeling/VGG_CNN_M_1024.py @@ -0,0 +1,61 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""VGG_CNN_M_1024 from https://arxiv.org/abs/1405.3531.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg + + +def add_VGG_CNN_M_1024_conv5_body(model): + model.Conv('data', 'conv1', 3, 96, 7, pad=0, stride=2) + model.Relu('conv1', 'conv1') + model.LRN('conv1', 'norm1', size=5, alpha=0.0005, beta=0.75, bias=2.) 
+ model.MaxPool('norm1', 'pool1', kernel=3, pad=0, stride=2) + model.StopGradient('pool1', 'pool1') + # No updates at conv1 and below (norm1 and pool1 have no params, + # so we can stop gradients before them, too) + model.Conv('pool1', 'conv2', 96, 256, 5, pad=0, stride=2) + model.Relu('conv2', 'conv2') + model.LRN('conv2', 'norm2', size=5, alpha=0.0005, beta=0.75, bias=2.) + model.MaxPool('norm2', 'pool2', kernel=3, pad=0, stride=2) + model.Conv('pool2', 'conv3', 256, 512, 3, pad=1, stride=1) + model.Relu('conv3', 'conv3') + model.Conv('conv3', 'conv4', 512, 512, 3, pad=1, stride=1) + model.Relu('conv4', 'conv4') + model.Conv('conv4', 'conv5', 512, 512, 3, pad=1, stride=1) + blob_out = model.Relu('conv5', 'conv5') + return blob_out, 512, 1. / 16. + + +def add_VGG_CNN_M_1024_roi_fc_head(model, blob_in, dim_in, spatial_scale): + model.RoIFeatureTransform( + blob_in, + 'pool5', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=6, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + model.FC('pool5', 'fc6', dim_in * 6 * 6, 4096) + model.Relu('fc6', 'fc6') + model.FC('fc6', 'fc7', 4096, 1024) + blob_out = model.Relu('fc7', 'fc7') + return blob_out, 1024 diff --git a/detectron/modeling/__init__.py b/detectron/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..01e173336f119a232400cecd0de43cee5656f8ea --- /dev/null +++ b/detectron/modeling/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## diff --git a/detectron/modeling/detector.py b/detectron/modeling/detector.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4e058f3b49f56c2a356d31f750510ccb39d8f6 --- /dev/null +++ b/detectron/modeling/detector.py @@ -0,0 +1,572 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Defines DetectionModelHelper, the class that represents a Detectron model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import logging + +from caffe2.python import cnn +from caffe2.python import core +from caffe2.python import workspace +from caffe2.python.modeling import initializers +from caffe2.python.modeling.parameter_info import ParameterTags + +from detectron.core.config import cfg +from detectron.ops.collect_and_distribute_fpn_rpn_proposals \ + import CollectAndDistributeFpnRpnProposalsOp +from detectron.ops.generate_proposal_labels import GenerateProposalLabelsOp +from detectron.ops.generate_proposals import GenerateProposalsOp +import detectron.roi_data.fast_rcnn as fast_rcnn_roi_data +import detectron.utils.c2 as c2_utils + +logger = logging.getLogger(__name__) + + +class DetectionModelHelper(cnn.CNNModelHelper): + def __init__(self, **kwargs): + # Handle args specific to the DetectionModelHelper, others pass through + # to CNNModelHelper + self.train = kwargs.get('train', False) + self.num_classes = kwargs.get('num_classes', -1) + assert self.num_classes > 0, 'num_classes must be > 0' + for k in ('train', 'num_classes'): + if k in kwargs: + del kwargs[k] + kwargs['order'] = 'NCHW' + # Defensively set cudnn_exhaustive_search to False in case the default + # changes in CNNModelHelper. The detection code uses variable size + # inputs that might not play nicely with cudnn_exhaustive_search. 
+ kwargs['cudnn_exhaustive_search'] = False + super(DetectionModelHelper, self).__init__(**kwargs) + self.roi_data_loader = None + self.losses = [] + self.metrics = [] + self.do_not_update_params = [] # Param on this list are not updated + self.net.Proto().type = cfg.MODEL.EXECUTION_TYPE + self.net.Proto().num_workers = cfg.NUM_GPUS * 4 + self.prev_use_cudnn = self.use_cudnn + self.gn_params = [] # Param on this list are GroupNorm parameters + + def TrainableParams(self, gpu_id=-1): + """Get the blob names for all trainable parameters, possibly filtered by + GPU id. + """ + return [ + p for p in self.params + if ( + p in self.param_to_grad and # p has a gradient + p not in self.do_not_update_params and # not on the blacklist + (gpu_id == -1 or # filter for gpu assignment, if gpu_id set + str(p).find('gpu_{}'.format(gpu_id)) == 0) + )] + + def AffineChannel(self, blob_in, blob_out, dim, inplace=False): + """Affine transformation to replace BN in networks where BN cannot be + used (e.g., because the minibatch size is too small). + + The operations can be done in place to save memory. + """ + blob_out = blob_out or self.net.NextName() + param_prefix = blob_out + + scale = self.create_param( + param_name=param_prefix + '_s', + initializer=initializers.Initializer("ConstantFill", value=1.), + tags=ParameterTags.WEIGHT, + shape=[dim, ], + ) + bias = self.create_param( + param_name=param_prefix + '_b', + initializer=initializers.Initializer("ConstantFill", value=0.), + tags=ParameterTags.BIAS, + shape=[dim, ], + ) + if inplace: + return self.net.AffineChannel([blob_in, scale, bias], blob_in) + else: + return self.net.AffineChannel([blob_in, scale, bias], blob_out) + + def GenerateProposals(self, blobs_in, blobs_out, anchors, spatial_scale): + """Op for generating RPN porposals. 
+ + blobs_in: + - 'rpn_cls_probs': 4D tensor of shape (N, A, H, W), where N is the + number of minibatch images, A is the number of anchors per + locations, and (H, W) is the spatial size of the prediction grid. + Each value represents a "probability of object" rating in [0, 1]. + - 'rpn_bbox_pred': 4D tensor of shape (N, 4 * A, H, W) of predicted + deltas for transformation anchor boxes into RPN proposals. + - 'im_info': 2D tensor of shape (N, 3) where the three columns encode + the input image's [height, width, scale]. Height and width are + for the input to the network, not the original image; scale is the + scale factor used to scale the original image to the network input + size. + + blobs_out: + - 'rpn_rois': 2D tensor of shape (R, 5), for R RPN proposals where the + five columns encode [batch ind, x1, y1, x2, y2]. The boxes are + w.r.t. the network input, which is a *scaled* version of the + original image; these proposals must be scaled by 1 / scale (where + scale comes from im_info; see above) to transform it back to the + original input image coordinate system. + - 'rpn_roi_probs': 1D tensor of objectness probability scores + (extracted from rpn_cls_probs; see above). 
+ """ + cfg_key = 'TRAIN' if self.train else 'TEST' + + if cfg[cfg_key].GENERATE_PROPOSALS_ON_GPU: + rpn_pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N + rpn_post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N + rpn_nms_thresh = cfg[cfg_key].RPN_NMS_THRESH + rpn_min_size = float(cfg[cfg_key].RPN_MIN_SIZE) + + input_name = str(blobs_in[0]) + lvl = int(input_name[-1]) if input_name[-1].isdigit() else None + anchors_name = 'anchors{}'.format(lvl) if lvl else 'anchors' + + for i in range(cfg.NUM_GPUS): + with c2_utils.CudaScope(i): + workspace.FeedBlob( + 'gpu_{}/{}'.format(i, anchors_name), + anchors.astype(np.float32)) + + self.net.GenerateProposals( + blobs_in + [anchors_name], + blobs_out, + spatial_scale=spatial_scale, + pre_nms_topN=rpn_pre_nms_topN, + post_nms_topN=rpn_post_nms_topN, + nms_thresh=rpn_nms_thresh, + min_size=rpn_min_size, + ) + else: + name = 'GenerateProposalsOp:' + ','.join([str(b) for b in blobs_in]) + # spatial_scale passed to the Python op is only used in + # convert_pkl_to_pb + self.net.Python( + GenerateProposalsOp(anchors, spatial_scale, self.train).forward + )(blobs_in, blobs_out, name=name, spatial_scale=spatial_scale) + + return blobs_out + + def GenerateProposalLabels(self, blobs_in): + """Op for generating training labels for RPN proposals. This is used + when training RPN jointly with Fast/Mask R-CNN (as in end-to-end + Faster R-CNN training). + + blobs_in: + - 'rpn_rois': 2D tensor of RPN proposals output by GenerateProposals + - 'roidb': roidb entries that will be labeled + - 'im_info': See GenerateProposals doc. + + blobs_out: + - (variable set of blobs): returns whatever blobs are required for + training the model. It does this by querying the data loader for + the list of blobs that are needed. + """ + name = 'GenerateProposalLabelsOp:' + ','.join( + [str(b) for b in blobs_in] + ) + + # The list of blobs is not known before run-time because it depends on + # the specific model being trained. 
Query the data loader to get the + # list of output blob names. + blobs_out = fast_rcnn_roi_data.get_fast_rcnn_blob_names( + is_training=self.train + ) + blobs_out = [core.ScopedBlobReference(b) for b in blobs_out] + + self.net.Python(GenerateProposalLabelsOp().forward)( + blobs_in, blobs_out, name=name + ) + return blobs_out + + def CollectAndDistributeFpnRpnProposals(self): + """Merge RPN proposals generated at multiple FPN levels and then + distribute those proposals to their appropriate FPN levels. An anchor + at one FPN level may predict an RoI that will map to another level, + hence the need to redistribute the proposals. + + This function assumes standard blob names for input and output blobs. + + Input blobs: [rpn_rois_fpn, ..., rpn_rois_fpn, + rpn_roi_probs_fpn, ..., rpn_roi_probs_fpn] + - rpn_rois_fpn are the RPN proposals for FPN level i; see rpn_rois + documentation from GenerateProposals. + - rpn_roi_probs_fpn are the RPN objectness probabilities for FPN + level i; see rpn_roi_probs documentation from GenerateProposals. + + If used during training, then the input blobs will also include: + [roidb, im_info] (see GenerateProposalLabels). + + Output blobs: [rois_fpn, ..., rois_rpn, rois, + rois_idx_restore] + - rois_fpn are the RPN proposals for FPN level i + - rois_idx_restore is a permutation on the concatenation of all + rois_fpn, i=min...max, such that when applied the RPN RoIs are + restored to their original order in the input blobs. + + If used during training, then the output blobs will also include: + [labels, bbox_targets, bbox_inside_weights, bbox_outside_weights]. 
+ """ + k_max = cfg.FPN.RPN_MAX_LEVEL + k_min = cfg.FPN.RPN_MIN_LEVEL + + # Prepare input blobs + rois_names = ['rpn_rois_fpn' + str(l) for l in range(k_min, k_max + 1)] + score_names = [ + 'rpn_roi_probs_fpn' + str(l) for l in range(k_min, k_max + 1) + ] + blobs_in = rois_names + score_names + if self.train: + blobs_in += ['roidb', 'im_info'] + blobs_in = [core.ScopedBlobReference(b) for b in blobs_in] + name = 'CollectAndDistributeFpnRpnProposalsOp:' + ','.join( + [str(b) for b in blobs_in] + ) + + # Prepare output blobs + blobs_out = fast_rcnn_roi_data.get_fast_rcnn_blob_names( + is_training=self.train + ) + blobs_out = [core.ScopedBlobReference(b) for b in blobs_out] + + outputs = self.net.Python( + CollectAndDistributeFpnRpnProposalsOp(self.train).forward + )(blobs_in, blobs_out, name=name) + + return outputs + + def DropoutIfTraining(self, blob_in, dropout_rate): + """Add dropout to blob_in if the model is in training mode and + dropout_rate is > 0.""" + blob_out = blob_in + if self.train and dropout_rate > 0: + blob_out = self.Dropout( + blob_in, blob_in, ratio=dropout_rate, is_test=False + ) + return blob_out + + def RoIFeatureTransform( + self, + blobs_in, + blob_out, + blob_rois='rois', + method='RoIPoolF', + resolution=7, + spatial_scale=1. / 16., + sampling_ratio=0 + ): + """Add the specified RoI pooling method. The sampling_ratio argument + is supported for some, but not all, RoI transform methods. 
+ + RoIFeatureTransform abstracts away: + - Use of FPN or not + - Specifics of the transform method + """ + assert method in {'RoIPoolF', 'RoIAlign'}, \ + 'Unknown pooling method: {}'.format(method) + has_argmax = (method == 'RoIPoolF') + if isinstance(blobs_in, list): + # FPN case: add RoIFeatureTransform to each FPN level + k_max = cfg.FPN.ROI_MAX_LEVEL # coarsest level of pyramid + k_min = cfg.FPN.ROI_MIN_LEVEL # finest level of pyramid + assert len(blobs_in) == k_max - k_min + 1 + bl_out_list = [] + for lvl in range(k_min, k_max + 1): + bl_in = blobs_in[k_max - lvl] # blobs_in is in reversed order + sc = spatial_scale[k_max - lvl] # in reversed order + bl_rois = blob_rois + '_fpn' + str(lvl) + bl_out = blob_out + '_fpn' + str(lvl) + bl_out_list.append(bl_out) + bl_argmax = ['_argmax_' + bl_out] if has_argmax else [] + self.net.__getattr__(method)( + [bl_in, bl_rois], [bl_out] + bl_argmax, + pooled_w=resolution, + pooled_h=resolution, + spatial_scale=sc, + sampling_ratio=sampling_ratio + ) + # The pooled features from all levels are concatenated along the + # batch dimension into a single 4D tensor. 
+ xform_shuffled, _ = self.net.Concat( + bl_out_list, [blob_out + '_shuffled', '_concat_' + blob_out], + axis=0 + ) + # Unshuffle to match rois from dataloader + restore_bl = blob_rois + '_idx_restore_int32' + xform_out = self.net.BatchPermutation( + [xform_shuffled, restore_bl], blob_out + ) + else: + # Single feature level + bl_argmax = ['_argmax_' + blob_out] if has_argmax else [] + # sampling_ratio is ignored for RoIPoolF + xform_out = self.net.__getattr__(method)( + [blobs_in, blob_rois], [blob_out] + bl_argmax, + pooled_w=resolution, + pooled_h=resolution, + spatial_scale=spatial_scale, + sampling_ratio=sampling_ratio + ) + # Only return the first blob (the transformed features) + return xform_out[0] if isinstance(xform_out, tuple) else xform_out + + def ConvShared( + self, + blob_in, + blob_out, + dim_in, + dim_out, + kernel, + weight=None, + bias=None, + **kwargs + ): + """Add conv op that shares weights and/or biases with another conv op. + """ + use_bias = ( + False if ('no_bias' in kwargs and kwargs['no_bias']) else True + ) + + if self.use_cudnn: + kwargs['engine'] = 'CUDNN' + kwargs['exhaustive_search'] = self.cudnn_exhaustive_search + if self.ws_nbytes_limit: + kwargs['ws_nbytes_limit'] = self.ws_nbytes_limit + + if use_bias: + blobs_in = [blob_in, weight, bias] + else: + blobs_in = [blob_in, weight] + + if 'no_bias' in kwargs: + del kwargs['no_bias'] + + return self.net.Conv( + blobs_in, blob_out, kernel=kernel, order=self.order, **kwargs + ) + + def BilinearInterpolation( + self, blob_in, blob_out, dim_in, dim_out, up_scale + ): + """Bilinear interpolation in space of scale. + + Takes input of NxKxHxW and outputs NxKx(sH)x(sW), where s:= up_scale + + Adapted from the CVPR'15 FCN code. 
+ See: https://github.com/shelhamer/fcn.berkeleyvision.org/blob/master/surgery.py + """ + assert dim_in == dim_out + assert up_scale % 2 == 0, 'Scale should be even' + + def upsample_filt(size): + factor = (size + 1) // 2 + if size % 2 == 1: + center = factor - 1 + else: + center = factor - 0.5 + og = np.ogrid[:size, :size] + return ((1 - abs(og[0] - center) / factor) * + (1 - abs(og[1] - center) / factor)) + + kernel_size = up_scale * 2 + bil_filt = upsample_filt(kernel_size) + + kernel = np.zeros( + (dim_in, dim_out, kernel_size, kernel_size), dtype=np.float32 + ) + kernel[range(dim_out), range(dim_in), :, :] = bil_filt + + blob = self.ConvTranspose( + blob_in, + blob_out, + dim_in, + dim_out, + kernel_size, + stride=int(up_scale), + pad=int(up_scale / 2), + weight_init=('GivenTensorFill', {'values': kernel}), + bias_init=('ConstantFill', {'value': 0.}) + ) + self.do_not_update_params.append(self.weights[-1]) + self.do_not_update_params.append(self.biases[-1]) + return blob + + def ConvAffine( # args in the same order of Conv() + self, blob_in, prefix, dim_in, dim_out, kernel, stride, pad, + group=1, dilation=1, + weight_init=None, + bias_init=None, + suffix='_bn', + inplace=False + ): + """ConvAffine adds a Conv op followed by a AffineChannel op (which + replaces BN during fine tuning). 
+ """ + conv_blob = self.Conv( + blob_in, + prefix, + dim_in, + dim_out, + kernel, + stride=stride, + pad=pad, + group=group, + dilation=dilation, + weight_init=weight_init, + bias_init=bias_init, + no_bias=1 + ) + blob_out = self.AffineChannel( + conv_blob, prefix + suffix, dim=dim_out, inplace=inplace + ) + return blob_out + + def ConvGN( # args in the same order of Conv() + self, blob_in, prefix, dim_in, dim_out, kernel, stride, pad, + group_gn, # num of groups in gn + group=1, dilation=1, + weight_init=None, + bias_init=None, + suffix='_gn', + no_conv_bias=1, + ): + """ConvGN adds a Conv op followed by a GroupNorm op, + including learnable scale/bias (gamma/beta) + """ + conv_blob = self.Conv( + blob_in, + prefix, + dim_in, + dim_out, + kernel, + stride=stride, + pad=pad, + group=group, + dilation=dilation, + weight_init=weight_init, + bias_init=bias_init, + no_bias=no_conv_bias) + + if group_gn < 1: + logger.warning( + 'Layer: {} (dim {}): ' + 'group_gn < 1; reset to 1.'.format(prefix, dim_in) + ) + group_gn = 1 + + blob_out = self.SpatialGN( + conv_blob, prefix + suffix, + dim_out, group=group_gn, # op's arg name is "group" + epsilon=cfg.GROUP_NORM.EPSILON,) + + self.gn_params.append(self.params[-1]) # add gn's bias to list + self.gn_params.append(self.params[-2]) # add gn's scale to list + return blob_out + + def DisableCudnn(self): + self.prev_use_cudnn = self.use_cudnn + self.use_cudnn = False + + def RestorePreviousUseCudnn(self): + prev_use_cudnn = self.use_cudnn + self.use_cudnn = self.prev_use_cudnn + self.prev_use_cudnn = prev_use_cudnn + + def UpdateWorkspaceLr(self, cur_iter, new_lr): + """Updates the model's current learning rate and the workspace (learning + rate and update history/momentum blobs). 
+ """ + # The workspace is the one source of truth for the lr + # The lr is always the same on all GPUs + cur_lr = workspace.FetchBlob('gpu_0/lr')[0] + # There are no type conversions between the lr in Python and the lr in + # the GPU (both are float32), so exact comparision is ok + if cur_lr != new_lr: + ratio = _get_lr_change_ratio(cur_lr, new_lr) + if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD: + logger.info( + 'Changing learning rate {:.6f} -> {:.6f} at iter {:d}'. + format(cur_lr, new_lr, cur_iter)) + self._SetNewLr(cur_lr, new_lr) + return new_lr + + def _SetNewLr(self, cur_lr, new_lr): + """Do the actual work of updating the model and workspace blobs. + """ + for i in range(cfg.NUM_GPUS): + with c2_utils.CudaScope(i): + workspace.FeedBlob( + 'gpu_{}/lr'.format(i), np.array([new_lr], dtype=np.float32)) + ratio = _get_lr_change_ratio(cur_lr, new_lr) + if cfg.SOLVER.SCALE_MOMENTUM and cur_lr > 1e-7 and \ + ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD: + self._CorrectMomentum(new_lr / cur_lr) + + def _CorrectMomentum(self, correction): + """The MomentumSGDUpdate op implements the update V as + + V := mu * V + lr * grad, + + where mu is the momentum factor, lr is the learning rate, and grad is + the stochastic gradient. Since V is not defined independently of the + learning rate (as it should ideally be), when the learning rate is + changed we should scale the update history V in order to make it + compatible in scale with lr * grad. + """ + logger.info( + 'Scaling update history by {:.6f} (new lr / old lr)'. + format(correction)) + for i in range(cfg.NUM_GPUS): + with c2_utils.CudaScope(i): + for param in self.TrainableParams(gpu_id=i): + op = core.CreateOperator( + 'Scale', [param + '_momentum'], [param + '_momentum'], + scale=correction) + workspace.RunOperatorOnce(op) + + def GetLossScale(self): + """Allow a way to configure the loss scale dynamically. + + This may be used in a distributed data parallel setting. 
+ """ + return 1.0 / cfg.NUM_GPUS + + def AddLosses(self, losses): + if not isinstance(losses, list): + losses = [losses] + # Conversion to str allows losses to include BlobReferences + losses = [c2_utils.UnscopeName(str(l)) for l in losses] + self.losses = list(set(self.losses + losses)) + + def AddMetrics(self, metrics): + if not isinstance(metrics, list): + metrics = [metrics] + self.metrics = list(set(self.metrics + metrics)) + + +def _get_lr_change_ratio(cur_lr, new_lr): + eps = 1e-10 + ratio = np.max( + (new_lr / np.max((cur_lr, eps)), cur_lr / np.max((new_lr, eps))) + ) + return ratio diff --git a/detectron/modeling/fast_rcnn_heads.py b/detectron/modeling/fast_rcnn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3a534e8c4922570c5f314999f6d120ecd9213a --- /dev/null +++ b/detectron/modeling/fast_rcnn_heads.py @@ -0,0 +1,178 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Various network "heads" for classification and bounding box prediction. + +The design is as follows: + +... -> RoI ----\ /-> box cls output -> cls loss + -> RoIFeatureXform -> box head +... -> Feature / \-> box reg output -> reg loss + Map + +The Fast R-CNN head produces a feature representation of the RoI for the purpose +of bounding box classification and regression. 
The box output module converts +the feature representation into classification and regression predictions. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill +from detectron.utils.net import get_group_gn +import detectron.utils.blob as blob_utils + + +# ---------------------------------------------------------------------------- # +# Fast R-CNN outputs and losses +# ---------------------------------------------------------------------------- # + +def add_fast_rcnn_outputs(model, blob_in, dim): + """Add RoI classification and bounding box regression output ops.""" + # Box classification layer + model.FC( + blob_in, + 'cls_score', + dim, + model.num_classes, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + if not model.train: # == if test + # Only add softmax when testing; during training the softmax is combined + # with the label cross entropy loss for numerical stability + model.Softmax('cls_score', 'cls_prob', engine='CUDNN') + # Box regression layer + num_bbox_reg_classes = ( + 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else model.num_classes + ) + model.FC( + blob_in, + 'bbox_pred', + dim, + num_bbox_reg_classes * 4, + weight_init=gauss_fill(0.001), + bias_init=const_fill(0.0) + ) + + +def add_fast_rcnn_losses(model): + """Add losses for RoI classification and bounding box regression.""" + cls_prob, loss_cls = model.net.SoftmaxWithLoss( + ['cls_score', 'labels_int32'], ['cls_prob', 'loss_cls'], + scale=model.GetLossScale() + ) + loss_bbox = model.net.SmoothL1Loss( + [ + 'bbox_pred', 'bbox_targets', 'bbox_inside_weights', + 'bbox_outside_weights' + ], + 'loss_bbox', + scale=model.GetLossScale() + ) + loss_gradients = blob_utils.get_loss_gradients(model, [loss_cls, loss_bbox]) + model.Accuracy(['cls_prob', 
'labels_int32'], 'accuracy_cls') + model.AddLosses(['loss_cls', 'loss_bbox']) + model.AddMetrics('accuracy_cls') + return loss_gradients + + +# ---------------------------------------------------------------------------- # +# Box heads +# ---------------------------------------------------------------------------- # + +def add_roi_2mlp_head(model, blob_in, dim_in, spatial_scale): + """Add a ReLU MLP with two hidden layers.""" + hidden_dim = cfg.FAST_RCNN.MLP_HEAD_DIM + roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION + roi_feat = model.RoIFeatureTransform( + blob_in, + 'roi_feat', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=roi_size, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + model.FC(roi_feat, 'fc6', dim_in * roi_size * roi_size, hidden_dim) + model.Relu('fc6', 'fc6') + model.FC('fc6', 'fc7', hidden_dim, hidden_dim) + model.Relu('fc7', 'fc7') + return 'fc7', hidden_dim + + +def add_roi_Xconv1fc_head(model, blob_in, dim_in, spatial_scale): + """Add a X conv + 1fc head, as a reference if not using GroupNorm""" + hidden_dim = cfg.FAST_RCNN.CONV_HEAD_DIM + roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION + roi_feat = model.RoIFeatureTransform( + blob_in, + 'roi_feat', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=roi_size, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + current = roi_feat + for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): + current = model.Conv( + current, 'head_conv' + str(i + 1), dim_in, hidden_dim, 3, + stride=1, pad=1, + weight_init=('MSRAFill', {}), + bias_init=('ConstantFill', {'value': 0.}), + no_bias=0) + current = model.Relu(current, current) + dim_in = hidden_dim + + fc_dim = cfg.FAST_RCNN.MLP_HEAD_DIM + model.FC(current, 'fc6', dim_in * roi_size * roi_size, fc_dim) + model.Relu('fc6', 'fc6') + return 'fc6', fc_dim + + +def add_roi_Xconv1fc_gn_head(model, blob_in, dim_in, spatial_scale): + 
"""Add a X conv + 1fc head, with GroupNorm""" + hidden_dim = cfg.FAST_RCNN.CONV_HEAD_DIM + roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION + roi_feat = model.RoIFeatureTransform( + blob_in, 'roi_feat', + blob_rois='rois', + method=cfg.FAST_RCNN.ROI_XFORM_METHOD, + resolution=roi_size, + sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + current = roi_feat + for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): + current = model.ConvGN( + current, 'head_conv' + str(i + 1), dim_in, hidden_dim, 3, + group_gn=get_group_gn(hidden_dim), + stride=1, pad=1, + weight_init=('MSRAFill', {}), + bias_init=('ConstantFill', {'value': 0.})) + current = model.Relu(current, current) + dim_in = hidden_dim + + fc_dim = cfg.FAST_RCNN.MLP_HEAD_DIM + model.FC(current, 'fc6', dim_in * roi_size * roi_size, fc_dim) + model.Relu('fc6', 'fc6') + return 'fc6', fc_dim diff --git a/detectron/modeling/generate_anchors.py b/detectron/modeling/generate_anchors.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b7a1ab63a8cd2da89d59f2a769d2b6900f08aa --- /dev/null +++ b/detectron/modeling/generate_anchors.py @@ -0,0 +1,123 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +# array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + + +def generate_anchors( + stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) +): + """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors + are centered on stride / 2, have (approximate) sqrt areas of the specified + sizes, and aspect ratios as given. + """ + return _generate_anchors( + stride, + np.array(sizes, dtype=np.float) / stride, + np.array(aspect_ratios, dtype=np.float) + ) + + +def _generate_anchors(base_size, scales, aspect_ratios): + """Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. 
+ """ + anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 + anchors = _ratio_enum(anchor, aspect_ratios) + anchors = np.vstack( + [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] + ) + return anchors + + +def _whctrs(anchor): + """Return width, height, x center, and y center for an anchor (window).""" + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack( + ( + x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1) + ) + ) + return anchors + + +def _ratio_enum(anchor, ratios): + """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """Enumerate a set of anchors for each scale wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/detectron/modeling/keypoint_rcnn_heads.py b/detectron/modeling/keypoint_rcnn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..edc095d0b00d5ea0d6aad27dc901072cf52a4e42 --- /dev/null +++ b/detectron/modeling/keypoint_rcnn_heads.py @@ -0,0 +1,217 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Various network "heads" for predicting keypoints in Mask R-CNN. + +The design is as follows: + +... -> RoI ----\ + -> RoIFeatureXform -> keypoint head -> keypoint output -> loss +... -> Feature / + Map + +The keypoint head produces a feature representation of the RoI for the purpose +of keypoint prediction. The keypoint output module converts the feature +representation into keypoint heatmaps. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill +import detectron.modeling.ResNet as ResNet +import detectron.utils.blob as blob_utils + + +# ---------------------------------------------------------------------------- # +# Keypoint R-CNN outputs and losses +# ---------------------------------------------------------------------------- # + +def add_keypoint_outputs(model, blob_in, dim): + """Add Mask R-CNN keypoint specific outputs: keypoint heatmaps.""" + # NxKxHxW + upsample_heatmap = (cfg.KRCNN.UP_SCALE > 1) + + if cfg.KRCNN.USE_DECONV: + # Apply ConvTranspose to the feature representation; results in 2x + # upsampling + blob_in = model.ConvTranspose( + blob_in, + 'kps_deconv', + dim, + cfg.KRCNN.DECONV_DIM, + kernel=cfg.KRCNN.DECONV_KERNEL, + pad=int(cfg.KRCNN.DECONV_KERNEL / 2 - 1), + stride=2, + weight_init=gauss_fill(0.01), + 
bias_init=const_fill(0.0) + ) + model.Relu('kps_deconv', 'kps_deconv') + dim = cfg.KRCNN.DECONV_DIM + + if upsample_heatmap: + blob_name = 'kps_score_lowres' + else: + blob_name = 'kps_score' + + if cfg.KRCNN.USE_DECONV_OUTPUT: + # Use ConvTranspose to predict heatmaps; results in 2x upsampling + blob_out = model.ConvTranspose( + blob_in, + blob_name, + dim, + cfg.KRCNN.NUM_KEYPOINTS, + kernel=cfg.KRCNN.DECONV_KERNEL, + pad=int(cfg.KRCNN.DECONV_KERNEL / 2 - 1), + stride=2, + weight_init=(cfg.KRCNN.CONV_INIT, {'std': 0.001}), + bias_init=const_fill(0.0) + ) + else: + # Use Conv to predict heatmaps; does no upsampling + blob_out = model.Conv( + blob_in, + blob_name, + dim, + cfg.KRCNN.NUM_KEYPOINTS, + kernel=1, + pad=0, + stride=1, + weight_init=(cfg.KRCNN.CONV_INIT, {'std': 0.001}), + bias_init=const_fill(0.0) + ) + + if upsample_heatmap: + # Increase heatmap output size via bilinear upsampling + blob_out = model.BilinearInterpolation( + blob_out, 'kps_score', cfg.KRCNN.NUM_KEYPOINTS, + cfg.KRCNN.NUM_KEYPOINTS, cfg.KRCNN.UP_SCALE + ) + + return blob_out + + +def add_keypoint_losses(model): + """Add Mask R-CNN keypoint specific losses.""" + # Reshape input from (N, K, H, W) to (NK, HW) + model.net.Reshape( + ['kps_score'], ['kps_score_reshaped', '_kps_score_old_shape'], + shape=(-1, cfg.KRCNN.HEATMAP_SIZE * cfg.KRCNN.HEATMAP_SIZE) + ) + # Softmax across **space** (woahh....space!) + # Note: this is not what is commonly called "spatial softmax" + # (i.e., softmax applied along the channel dimension at each spatial + # location); This is softmax applied over a set of spatial locations (i.e., + # each spatial location is a "class"). 
+ kps_prob, loss_kps = model.net.SoftmaxWithLoss( + ['kps_score_reshaped', 'keypoint_locations_int32', 'keypoint_weights'], + ['kps_prob', 'loss_kps'], + scale=cfg.KRCNN.LOSS_WEIGHT / cfg.NUM_GPUS, + spatial=0 + ) + if not cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS: + # Discussion: the softmax loss above will average the loss by the sum of + # keypoint_weights, i.e. the total number of visible keypoints. Since + # the number of visible keypoints can vary significantly between + # minibatches, this has the effect of up-weighting the importance of + # minibatches with few visible keypoints. (Imagine the extreme case of + # only one visible keypoint versus N: in the case of N, each one + # contributes 1/N to the gradient compared to the single keypoint + # determining the gradient direction). Instead, we can normalize the + # loss by the total number of keypoints, if it were the case that all + # keypoints were visible in a full minibatch. (Returning to the example, + # this means that the one visible keypoint contributes as much as each + # of the N keypoints.) + model.StopGradient( + 'keypoint_loss_normalizer', 'keypoint_loss_normalizer' + ) + loss_kps = model.net.Mul( + ['loss_kps', 'keypoint_loss_normalizer'], 'loss_kps_normalized' + ) + loss_gradients = blob_utils.get_loss_gradients(model, [loss_kps]) + model.AddLosses(loss_kps) + return loss_gradients + + +# ---------------------------------------------------------------------------- # +# Keypoint heads +# ---------------------------------------------------------------------------- # + +def add_ResNet_roi_conv5_head_for_keypoints( + model, blob_in, dim_in, spatial_scale +): + """Add a ResNet "conv5" / "stage5" head for Mask R-CNN keypoint prediction. 
+ """ + model.RoIFeatureTransform( + blob_in, + '_[pose]_pool5', + blob_rois='keypoint_rois', + method=cfg.KRCNN.ROI_XFORM_METHOD, + resolution=cfg.KRCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.KRCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + # Using the prefix '_[pose]_' to 'res5' enables initializing the head's + # parameters using pretrained 'res5' parameters if given (see + # utils.net.initialize_from_weights_file) + s, dim_in = ResNet.add_stage( + model, + '_[pose]_res5', + '_[pose]_pool5', + 3, + dim_in, + 2048, + 512, + cfg.KRCNN.DILATION, + stride_init=int(cfg.KRCNN.ROI_XFORM_RESOLUTION / 7) + ) + return s, 2048 + + +def add_roi_pose_head_v1convX(model, blob_in, dim_in, spatial_scale): + """Add a Mask R-CNN keypoint head. v1convX design: X * (conv).""" + hidden_dim = cfg.KRCNN.CONV_HEAD_DIM + kernel_size = cfg.KRCNN.CONV_HEAD_KERNEL + pad_size = kernel_size // 2 + current = model.RoIFeatureTransform( + blob_in, + '_[pose]_roi_feat', + blob_rois='keypoint_rois', + method=cfg.KRCNN.ROI_XFORM_METHOD, + resolution=cfg.KRCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.KRCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + for i in range(cfg.KRCNN.NUM_STACKED_CONVS): + current = model.Conv( + current, + 'conv_fcn' + str(i + 1), + dim_in, + hidden_dim, + kernel_size, + stride=1, + pad=pad_size, + weight_init=(cfg.KRCNN.CONV_INIT, {'std': 0.01}), + bias_init=('ConstantFill', {'value': 0.}) + ) + current = model.Relu(current, current) + dim_in = hidden_dim + + return current, hidden_dim diff --git a/detectron/modeling/mask_rcnn_heads.py b/detectron/modeling/mask_rcnn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..bf76e83fa293845648344e0a3e7bd5c7abe0e7d8 --- /dev/null +++ b/detectron/modeling/mask_rcnn_heads.py @@ -0,0 +1,329 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Various network "heads" for predicting masks in Mask R-CNN. + +The design is as follows: + +... -> RoI ----\ + -> RoIFeatureXform -> mask head -> mask output -> loss +... -> Feature / + Map + +The mask head produces a feature representation of the RoI for the purpose +of mask prediction. The mask output module converts the feature representation +into real-valued (soft) masks. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill +from detectron.utils.net import get_group_gn +import detectron.modeling.ResNet as ResNet +import detectron.utils.blob as blob_utils + + +# ---------------------------------------------------------------------------- # +# Mask R-CNN outputs and losses +# ---------------------------------------------------------------------------- # + +def add_mask_rcnn_outputs(model, blob_in, dim): + """Add Mask R-CNN specific outputs: either mask logits or probs.""" + num_cls = cfg.MODEL.NUM_CLASSES if cfg.MRCNN.CLS_SPECIFIC_MASK else 1 + + if cfg.MRCNN.USE_FC_OUTPUT: + # Predict masks with a fully connected layer (ignore 'fcn' in the blob + # name) + dim_fc = int(dim * (cfg.MRCNN.RESOLUTION / cfg.MRCNN.UPSAMPLE_RATIO)**2) + blob_out = model.FC( + blob_in, + 'mask_fcn_logits', + dim_fc, + num_cls * cfg.MRCNN.RESOLUTION**2, + weight_init=gauss_fill(0.001), + bias_init=const_fill(0.0) + ) + else: + # Predict mask using Conv + + # Use GaussianFill for class-agnostic mask prediction; fills based on + # fan-in can be too large in this case and cause divergence + fill = ( + cfg.MRCNN.CONV_INIT + if cfg.MRCNN.CLS_SPECIFIC_MASK else 'GaussianFill' + ) + blob_out = model.Conv( + blob_in, + 'mask_fcn_logits', + dim, + num_cls, + kernel=1, + pad=0, + stride=1, + weight_init=(fill, {'std': 0.001}), + bias_init=const_fill(0.0) + ) + + if cfg.MRCNN.UPSAMPLE_RATIO > 1: + blob_out = model.BilinearInterpolation( + 'mask_fcn_logits', 'mask_fcn_logits_up', num_cls, num_cls, + cfg.MRCNN.UPSAMPLE_RATIO + ) + + if not model.train: # == if test + blob_out = model.net.Sigmoid(blob_out, 'mask_fcn_probs') + + return blob_out + + +def add_mask_rcnn_losses(model, blob_mask): + """Add Mask R-CNN specific losses.""" + loss_mask 
= model.net.SigmoidCrossEntropyLoss( + [blob_mask, 'masks_int32'], + 'loss_mask', + scale=model.GetLossScale() * cfg.MRCNN.WEIGHT_LOSS_MASK + ) + loss_gradients = blob_utils.get_loss_gradients(model, [loss_mask]) + model.AddLosses('loss_mask') + return loss_gradients + + +# ---------------------------------------------------------------------------- # +# Mask heads +# ---------------------------------------------------------------------------- # + +def mask_rcnn_fcn_head_v1up4convs(model, blob_in, dim_in, spatial_scale): + """v1up design: 4 * (conv 3x3), convT 2x2.""" + return mask_rcnn_fcn_head_v1upXconvs( + model, blob_in, dim_in, spatial_scale, 4 + ) + + +def mask_rcnn_fcn_head_v1up4convs_gn(model, blob_in, dim_in, spatial_scale): + """v1up design: 4 * (conv 3x3), convT 2x2, with GroupNorm""" + return mask_rcnn_fcn_head_v1upXconvs_gn( + model, blob_in, dim_in, spatial_scale, 4 + ) + + +def mask_rcnn_fcn_head_v1up(model, blob_in, dim_in, spatial_scale): + """v1up design: 2 * (conv 3x3), convT 2x2.""" + return mask_rcnn_fcn_head_v1upXconvs( + model, blob_in, dim_in, spatial_scale, 2 + ) + + +def mask_rcnn_fcn_head_v1upXconvs( + model, blob_in, dim_in, spatial_scale, num_convs +): + """v1upXconvs design: X * (conv 3x3), convT 2x2.""" + current = model.RoIFeatureTransform( + blob_in, + blob_out='_[mask]_roi_feat', + blob_rois='mask_rois', + method=cfg.MRCNN.ROI_XFORM_METHOD, + resolution=cfg.MRCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.MRCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + dilation = cfg.MRCNN.DILATION + dim_inner = cfg.MRCNN.DIM_REDUCED + + for i in range(num_convs): + current = model.Conv( + current, + '_[mask]_fcn' + str(i + 1), + dim_in, + dim_inner, + kernel=3, + dilation=dilation, + pad=1 * dilation, + stride=1, + weight_init=(cfg.MRCNN.CONV_INIT, {'std': 0.001}), + bias_init=('ConstantFill', {'value': 0.}) + ) + current = model.Relu(current, current) + dim_in = dim_inner + + # upsample layer + model.ConvTranspose( + 
current, + 'conv5_mask', + dim_inner, + dim_inner, + kernel=2, + pad=0, + stride=2, + weight_init=(cfg.MRCNN.CONV_INIT, {'std': 0.001}), + bias_init=const_fill(0.0) + ) + blob_mask = model.Relu('conv5_mask', 'conv5_mask') + + return blob_mask, dim_inner + + +def mask_rcnn_fcn_head_v1upXconvs_gn( + model, blob_in, dim_in, spatial_scale, num_convs +): + """v1upXconvs design: X * (conv 3x3), convT 2x2, with GroupNorm""" + current = model.RoIFeatureTransform( + blob_in, + blob_out='_mask_roi_feat', + blob_rois='mask_rois', + method=cfg.MRCNN.ROI_XFORM_METHOD, + resolution=cfg.MRCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.MRCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + dilation = cfg.MRCNN.DILATION + dim_inner = cfg.MRCNN.DIM_REDUCED + + for i in range(num_convs): + current = model.ConvGN( + current, + '_mask_fcn' + str(i + 1), + dim_in, + dim_inner, + group_gn=get_group_gn(dim_inner), + kernel=3, + pad=1 * dilation, + stride=1, + weight_init=(cfg.MRCNN.CONV_INIT, {'std': 0.001}), + bias_init=('ConstantFill', {'value': 0.}) + ) + current = model.Relu(current, current) + dim_in = dim_inner + + # upsample layer + model.ConvTranspose( + current, + 'conv5_mask', + dim_inner, + dim_inner, + kernel=2, + pad=0, + stride=2, + weight_init=(cfg.MRCNN.CONV_INIT, {'std': 0.001}), + bias_init=const_fill(0.0) + ) + blob_mask = model.Relu('conv5_mask', 'conv5_mask') + + return blob_mask, dim_inner + + +def mask_rcnn_fcn_head_v0upshare(model, blob_in, dim_in, spatial_scale): + """Use a ResNet "conv5" / "stage5" head for mask prediction. Weights and + computation are shared with the conv5 box head. Computation can only be + shared during training, since inference is cascaded. + + v0upshare design: conv5, convT 2x2. 
+ """ + # Since box and mask head are shared, these must match + assert cfg.MRCNN.ROI_XFORM_RESOLUTION == cfg.FAST_RCNN.ROI_XFORM_RESOLUTION + + if model.train: # share computation with bbox head at training time + dim_conv5 = 2048 + blob_conv5 = model.net.SampleAs( + ['res5_2_sum', 'roi_has_mask_int32'], + ['_[mask]_res5_2_sum_sliced'] + ) + else: # re-compute at test time + blob_conv5, dim_conv5 = add_ResNet_roi_conv5_head_for_masks( + model, + blob_in, + dim_in, + spatial_scale + ) + + dim_reduced = cfg.MRCNN.DIM_REDUCED + + blob_mask = model.ConvTranspose( + blob_conv5, + 'conv5_mask', + dim_conv5, + dim_reduced, + kernel=2, + pad=0, + stride=2, + weight_init=(cfg.MRCNN.CONV_INIT, {'std': 0.001}), # std only for gauss + bias_init=const_fill(0.0) + ) + model.Relu('conv5_mask', 'conv5_mask') + + return blob_mask, dim_reduced + + +def mask_rcnn_fcn_head_v0up(model, blob_in, dim_in, spatial_scale): + """v0up design: conv5, deconv 2x2 (no weight sharing with the box head).""" + blob_conv5, dim_conv5 = add_ResNet_roi_conv5_head_for_masks( + model, + blob_in, + dim_in, + spatial_scale + ) + + dim_reduced = cfg.MRCNN.DIM_REDUCED + + model.ConvTranspose( + blob_conv5, + 'conv5_mask', + dim_conv5, + dim_reduced, + kernel=2, + pad=0, + stride=2, + weight_init=('GaussianFill', {'std': 0.001}), + bias_init=const_fill(0.0) + ) + blob_mask = model.Relu('conv5_mask', 'conv5_mask') + + return blob_mask, dim_reduced + + +def add_ResNet_roi_conv5_head_for_masks(model, blob_in, dim_in, spatial_scale): + """Add a ResNet "conv5" / "stage5" head for predicting masks.""" + model.RoIFeatureTransform( + blob_in, + blob_out='_[mask]_pool5', + blob_rois='mask_rois', + method=cfg.MRCNN.ROI_XFORM_METHOD, + resolution=cfg.MRCNN.ROI_XFORM_RESOLUTION, + sampling_ratio=cfg.MRCNN.ROI_XFORM_SAMPLING_RATIO, + spatial_scale=spatial_scale + ) + + dilation = cfg.MRCNN.DILATION + stride_init = int(cfg.MRCNN.ROI_XFORM_RESOLUTION / 7) # by default: 2 + + s, dim_in = ResNet.add_stage( + model, + 
'_[mask]_res5', + '_[mask]_pool5', + 3, + dim_in, + 2048, + 512, + dilation, + stride_init=stride_init + ) + + return s, 2048 diff --git a/detectron/modeling/model_builder.py b/detectron/modeling/model_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..25ab21770c4bc68108b50f9e69b6d75f2e0a4c23 --- /dev/null +++ b/detectron/modeling/model_builder.py @@ -0,0 +1,675 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Detectron model construction functions. + +Detectron supports a large number of model types. The configuration space is +large. To get a sense, a given model is in element in the cartesian product of: + + - backbone (e.g., VGG16, ResNet, ResNeXt) + - FPN (on or off) + - RPN only (just proposals) + - Fixed proposals for Fast R-CNN, RFCN, Mask R-CNN (with or without keypoints) + - End-to-end model with RPN + Fast R-CNN (i.e., Faster R-CNN), Mask R-CNN, ... + - Different "head" choices for the model + - ... many configuration options ... + +A given model is made by combining many basic components. The result is flexible +though somewhat complex to understand at first. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import importlib +import logging + +from caffe2.python import core +from caffe2.python import workspace + +from detectron.core.config import cfg +from detectron.modeling.detector import DetectionModelHelper +from detectron.roi_data.loader import RoIDataLoader +import detectron.modeling.fast_rcnn_heads as fast_rcnn_heads +import detectron.modeling.keypoint_rcnn_heads as keypoint_rcnn_heads +import detectron.modeling.mask_rcnn_heads as mask_rcnn_heads +import detectron.modeling.name_compat as name_compat +import detectron.modeling.optimizer as optim +import detectron.modeling.retinanet_heads as retinanet_heads +import detectron.modeling.rfcn_heads as rfcn_heads +import detectron.modeling.rpn_heads as rpn_heads +import detectron.roi_data.minibatch as roi_data_minibatch +import detectron.utils.c2 as c2_utils + +logger = logging.getLogger(__name__) + + +# ---------------------------------------------------------------------------- # +# Generic recomposable model builders +# +# For example, you can create a Fast R-CNN model with the ResNet-50-C4 backbone +# with the configuration: +# +# MODEL: +# TYPE: generalized_rcnn +# CONV_BODY: ResNet.add_ResNet50_conv4_body +# ROI_HEAD: ResNet.add_ResNet_roi_conv5_head +# ---------------------------------------------------------------------------- # + +def generalized_rcnn(model): + """This model type handles: + - Fast R-CNN + - RPN only (not integrated with Fast R-CNN) + - Faster R-CNN (stagewise training from NIPS paper) + - Faster R-CNN (end-to-end joint training) + - Mask R-CNN (stagewise training from NIPS paper) + - Mask R-CNN (end-to-end joint training) + """ + return build_generic_detection_model( + model, + get_func(cfg.MODEL.CONV_BODY), + add_roi_box_head_func=get_func(cfg.FAST_RCNN.ROI_BOX_HEAD), + 
add_roi_mask_head_func=get_func(cfg.MRCNN.ROI_MASK_HEAD), + add_roi_keypoint_head_func=get_func(cfg.KRCNN.ROI_KEYPOINTS_HEAD), + freeze_conv_body=cfg.TRAIN.FREEZE_CONV_BODY + ) + + +def rfcn(model): + # TODO(rbg): fold into build_generic_detection_model + return build_generic_rfcn_model(model, get_func(cfg.MODEL.CONV_BODY)) + + +def retinanet(model): + # TODO(rbg): fold into build_generic_detection_model + return build_generic_retinanet_model(model, get_func(cfg.MODEL.CONV_BODY)) + + +# ---------------------------------------------------------------------------- # +# Helper functions for building various re-usable network bits +# ---------------------------------------------------------------------------- # + +def create(model_type_func, train=False, gpu_id=0): + """Generic model creation function that dispatches to specific model + building functions. + + By default, this function will generate a data parallel model configured to + run on cfg.NUM_GPUS devices. However, you can restrict it to build a model + targeted to a specific GPU by specifying gpu_id. This is used by + optimizer.build_data_parallel_model() during test time. + """ + model = DetectionModelHelper( + name=model_type_func, + train=train, + num_classes=cfg.MODEL.NUM_CLASSES, + init_params=train + ) + model.only_build_forward_pass = False + model.target_gpu_id = gpu_id + return get_func(model_type_func)(model) + + +def get_func(func_name): + """Helper to return a function object by name. func_name must identify a + function in this module or the path to a function relative to the base + 'modeling' module. + """ + if func_name == '': + return None + new_func_name = name_compat.get_new_name(func_name) + if new_func_name != func_name: + logger.warn( + 'Remapping old function name: {} -> {}'. 
+ format(func_name, new_func_name) + ) + func_name = new_func_name + try: + parts = func_name.split('.') + # Refers to a function in this module + if len(parts) == 1: + return globals()[parts[0]] + # Otherwise, assume we're referencing a module under modeling + module_name = 'detectron.modeling.' + '.'.join(parts[:-1]) + module = importlib.import_module(module_name) + return getattr(module, parts[-1]) + except Exception: + logger.error('Failed to find function: {}'.format(func_name)) + raise + + +def build_generic_detection_model( + model, + add_conv_body_func, + add_roi_box_head_func=None, + add_roi_mask_head_func=None, + add_roi_keypoint_head_func=None, + freeze_conv_body=False +): + def _single_gpu_build_func(model): + """Build the model on a single GPU. Can be called in a loop over GPUs + with name and device scoping to create a data parallel model. + """ + # Add the conv body (called "backbone architecture" in papers) + # E.g., ResNet-50, ResNet-50-FPN, ResNeXt-101-FPN, etc. + blob_conv, dim_conv, spatial_scale_conv = add_conv_body_func(model) + if freeze_conv_body: + for b in c2_utils.BlobReferenceList(blob_conv): + model.StopGradient(b, b) + + if not model.train: # == inference + # Create a net that can be used to execute the conv body on an image + # (without also executing RPN or any other network heads) + model.conv_body_net = model.net.Clone('conv_body_net') + + head_loss_gradients = { + 'rpn': None, + 'box': None, + 'mask': None, + 'keypoints': None, + } + + if cfg.RPN.RPN_ON: + # Add the RPN head + head_loss_gradients['rpn'] = rpn_heads.add_generic_rpn_outputs( + model, blob_conv, dim_conv, spatial_scale_conv + ) + + if cfg.FPN.FPN_ON: + # After adding the RPN head, restrict FPN blobs and scales to + # those used in the RoI heads + blob_conv, spatial_scale_conv = _narrow_to_fpn_roi_levels( + blob_conv, spatial_scale_conv + ) + + if not cfg.MODEL.RPN_ONLY: + # Add the Fast R-CNN head + head_loss_gradients['box'] = _add_fast_rcnn_head( + model, 
add_roi_box_head_func, blob_conv, dim_conv, + spatial_scale_conv + ) + + if cfg.MODEL.MASK_ON: + # Add the mask head + head_loss_gradients['mask'] = _add_roi_mask_head( + model, add_roi_mask_head_func, blob_conv, dim_conv, + spatial_scale_conv + ) + + if cfg.MODEL.KEYPOINTS_ON: + # Add the keypoint head + head_loss_gradients['keypoint'] = _add_roi_keypoint_head( + model, add_roi_keypoint_head_func, blob_conv, dim_conv, + spatial_scale_conv + ) + + if model.train: + loss_gradients = {} + for lg in head_loss_gradients.values(): + if lg is not None: + loss_gradients.update(lg) + return loss_gradients + else: + return None + + optim.build_data_parallel_model(model, _single_gpu_build_func) + return model + + +def _narrow_to_fpn_roi_levels(blobs, spatial_scales): + """Return only the blobs and spatial scales that will be used for RoI heads. + Inputs `blobs` and `spatial_scales` may include extra blobs and scales that + are used for RPN proposals, but not for RoI heads. + """ + # Code only supports case when RPN and ROI min levels are the same + assert cfg.FPN.RPN_MIN_LEVEL == cfg.FPN.ROI_MIN_LEVEL + # RPN max level can be >= to ROI max level + assert cfg.FPN.RPN_MAX_LEVEL >= cfg.FPN.ROI_MAX_LEVEL + # FPN RPN max level might be > FPN ROI max level in which case we + # need to discard some leading conv blobs (blobs are ordered from + # max/coarsest level to min/finest level) + num_roi_levels = cfg.FPN.ROI_MAX_LEVEL - cfg.FPN.ROI_MIN_LEVEL + 1 + return blobs[-num_roi_levels:], spatial_scales[-num_roi_levels:] + + +def _add_fast_rcnn_head( + model, add_roi_box_head_func, blob_in, dim_in, spatial_scale_in +): + """Add a Fast R-CNN head to the model.""" + blob_frcn, dim_frcn = add_roi_box_head_func( + model, blob_in, dim_in, spatial_scale_in + ) + fast_rcnn_heads.add_fast_rcnn_outputs(model, blob_frcn, dim_frcn) + if model.train: + loss_gradients = fast_rcnn_heads.add_fast_rcnn_losses(model) + else: + loss_gradients = None + return loss_gradients + + +def _add_roi_mask_head( + 
model, add_roi_mask_head_func, blob_in, dim_in, spatial_scale_in +): + """Add a mask prediction head to the model.""" + # Capture model graph before adding the mask head + bbox_net = copy.deepcopy(model.net.Proto()) + # Add the mask head + blob_mask_head, dim_mask_head = add_roi_mask_head_func( + model, blob_in, dim_in, spatial_scale_in + ) + # Add the mask output + blob_mask = mask_rcnn_heads.add_mask_rcnn_outputs( + model, blob_mask_head, dim_mask_head + ) + + if not model.train: # == inference + # Inference uses a cascade of box predictions, then mask predictions. + # This requires separate nets for box and mask prediction. + # So we extract the mask prediction net, store it as its own network, + # then restore model.net to be the bbox-only network + model.mask_net, blob_mask = c2_utils.SuffixNet( + 'mask_net', model.net, len(bbox_net.op), blob_mask + ) + model.net._net = bbox_net + loss_gradients = None + else: + loss_gradients = mask_rcnn_heads.add_mask_rcnn_losses(model, blob_mask) + return loss_gradients + + +def _add_roi_keypoint_head( + model, add_roi_keypoint_head_func, blob_in, dim_in, spatial_scale_in +): + """Add a keypoint prediction head to the model.""" + # Capture model graph before adding the mask head + bbox_net = copy.deepcopy(model.net.Proto()) + # Add the keypoint head + blob_keypoint_head, dim_keypoint_head = add_roi_keypoint_head_func( + model, blob_in, dim_in, spatial_scale_in + ) + # Add the keypoint output + blob_keypoint = keypoint_rcnn_heads.add_keypoint_outputs( + model, blob_keypoint_head, dim_keypoint_head + ) + + if not model.train: # == inference + # Inference uses a cascade of box predictions, then keypoint predictions + # This requires separate nets for box and keypoint prediction. 
+ # So we extract the keypoint prediction net, store it as its own + # network, then restore model.net to be the bbox-only network + model.keypoint_net, keypoint_blob_out = c2_utils.SuffixNet( + 'keypoint_net', model.net, len(bbox_net.op), blob_keypoint + ) + model.net._net = bbox_net + loss_gradients = None + else: + loss_gradients = keypoint_rcnn_heads.add_keypoint_losses(model) + return loss_gradients + + +def build_generic_rfcn_model(model, add_conv_body_func, dim_reduce=None): + # TODO(rbg): fold this function into build_generic_detection_model + def _single_gpu_build_func(model): + """Builds the model on a single GPU. Can be called in a loop over GPUs + with name and device scoping to create a data parallel model.""" + blob, dim, spatial_scale = add_conv_body_func(model) + if not model.train: + model.conv_body_net = model.net.Clone('conv_body_net') + rfcn_heads.add_rfcn_outputs(model, blob, dim, dim_reduce, spatial_scale) + if model.train: + loss_gradients = fast_rcnn_heads.add_fast_rcnn_losses(model) + return loss_gradients if model.train else None + + optim.build_data_parallel_model(model, _single_gpu_build_func) + return model + + +def build_generic_retinanet_model( + model, add_conv_body_func, freeze_conv_body=False +): + # TODO(rbg): fold this function into build_generic_detection_model + def _single_gpu_build_func(model): + """Builds the model on a single GPU. 
Can be called in a loop over GPUs + with name and device scoping to create a data parallel model.""" + blobs, dim, spatial_scales = add_conv_body_func(model) + if not model.train: + model.conv_body_net = model.net.Clone('conv_body_net') + retinanet_heads.add_fpn_retinanet_outputs( + model, blobs, dim, spatial_scales + ) + if model.train: + loss_gradients = retinanet_heads.add_fpn_retinanet_losses( + model + ) + return loss_gradients if model.train else None + + optim.build_data_parallel_model(model, _single_gpu_build_func) + return model + + +# ---------------------------------------------------------------------------- # +# Network inputs +# ---------------------------------------------------------------------------- # + +def add_training_inputs(model, roidb=None): + """Create network input ops and blobs used for training. To be called + *after* model_builder.create(). + """ + # Implementation notes: + # Typically, one would create the input ops and then the rest of the net. + # However, creating the input ops depends on loading the dataset, which + # can take a few minutes for COCO. + # We prefer to avoid waiting so debugging can fail fast. + # Thus, we create the net *without input ops* prior to loading the + # dataset, and then add the input ops after loading the dataset. + # Since we defer input op creation, we need to do a little bit of surgery + # to place the input ops at the start of the network op list. 
+ assert model.train, 'Training inputs can only be added to a trainable model' + if roidb is not None: + # To make debugging easier you can set cfg.DATA_LOADER.NUM_THREADS = 1 + model.roi_data_loader = RoIDataLoader( + roidb, + num_loaders=cfg.DATA_LOADER.NUM_THREADS, + minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE, + blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY + ) + orig_num_op = len(model.net._net.op) + blob_names = roi_data_minibatch.get_minibatch_blob_names(is_training=True) + for gpu_id in range(cfg.NUM_GPUS): + with c2_utils.NamedCudaScope(gpu_id): + for blob_name in blob_names: + workspace.CreateBlob(core.ScopedName(blob_name)) + model.net.DequeueBlobs( + model.roi_data_loader._blobs_queue_name, blob_names + ) + # A little op surgery to move input ops to the start of the net + diff = len(model.net._net.op) - orig_num_op + new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff] + del model.net._net.op[:] + model.net._net.op.extend(new_op) + + +def add_inference_inputs(model): + """Create network input blobs used for inference.""" + + def create_input_blobs_for_net(net_def): + for op in net_def.op: + for blob_in in op.input: + if not workspace.HasBlob(blob_in): + workspace.CreateBlob(blob_in) + + create_input_blobs_for_net(model.net.Proto()) + if cfg.MODEL.MASK_ON: + create_input_blobs_for_net(model.mask_net.Proto()) + if cfg.MODEL.KEYPOINTS_ON: + create_input_blobs_for_net(model.keypoint_net.Proto()) + + +# ---------------------------------------------------------------------------- # +# ********************** DEPRECATED FUNCTIONALITY BELOW ********************** # +# ---------------------------------------------------------------------------- # + +# ---------------------------------------------------------------------------- # +# Hardcoded functions to create various types of common models +# +# *** This type of model definition is deprecated *** +# *** Use the generic composable versions instead *** +# +# 
---------------------------------------------------------------------------- # + +import detectron.modeling.ResNet as ResNet +import detectron.modeling.VGG16 as VGG16 +import detectron.modeling.VGG_CNN_M_1024 as VGG_CNN_M_1024 + + +def fast_rcnn(model): + logger.warn('Deprecated: use `MODEL.TYPE: generalized_rcnn`.') + return generalized_rcnn(model) + + +def mask_rcnn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.MASK_ON: True`' + ) + return generalized_rcnn(model) + + +def keypoint_rcnn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.KEYPOINTS_ON: True`' + ) + return generalized_rcnn(model) + + +def mask_and_keypoint_rcnn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.MASK_ON: True and ``MODEL.KEYPOINTS_ON: True`' + ) + return generalized_rcnn(model) + + +def rpn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.RPN_ONLY: True`' + ) + return generalized_rcnn(model) + + +def fpn_rpn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.RPN_ONLY: True` and FPN enabled via configs' + ) + return generalized_rcnn(model) + + +def faster_rcnn(model): + logger.warn( + 'Deprecated: use `MODEL.TYPE: generalized_rcnn` with ' + '`MODEL.FASTER_RCNN: True`' + ) + return generalized_rcnn(model) + + +def fast_rcnn_frozen_features(model): + logger.warn('Deprecated: use `TRAIN.FREEZE_CONV_BODY: True` instead') + return build_generic_detection_model( + model, + get_func(cfg.MODEL.CONV_BODY), + add_roi_box_head_func=get_func(cfg.FAST_RCNN.ROI_BOX_HEAD), + freeze_conv_body=True + ) + + +def rpn_frozen_features(model): + logger.warn('Deprecated: use `TRAIN.FREEZE_CONV_BODY: True` instead') + return build_generic_detection_model( + model, get_func(cfg.MODEL.CONV_BODY), freeze_conv_body=True + ) + + +def fpn_rpn_frozen_features(model): + logger.warn('Deprecated: use 
`TRAIN.FREEZE_CONV_BODY: True` instead') + return build_generic_detection_model( + model, get_func(cfg.MODEL.CONV_BODY), freeze_conv_body=True + ) + + +def mask_rcnn_frozen_features(model): + logger.warn('Deprecated: use `TRAIN.FREEZE_CONV_BODY: True` instead') + return build_generic_detection_model( + model, + get_func(cfg.MODEL.CONV_BODY), + add_roi_box_head_func=get_func(cfg.FAST_RCNN.ROI_BOX_HEAD), + add_roi_mask_head_func=get_func(cfg.MRCNN.ROI_MASK_HEAD), + freeze_conv_body=True + ) + + +def keypoint_rcnn_frozen_features(model): + logger.warn('Deprecated: use `TRAIN.FREEZE_CONV_BODY: True` instead') + return build_generic_detection_model( + model, + get_func(cfg.MODEL.CONV_BODY), + add_roi_box_head_func=get_func(cfg.FAST_RCNN.ROI_BOX_HEAD), + add_roi_keypoint_head_func=get_func(cfg.KRCNN.ROI_KEYPOINTS_HEAD), + freeze_conv_body=True + ) + + +# ---------------------------------------------------------------------------- # +# Fast R-CNN models +# ---------------------------------------------------------------------------- # + + +def VGG_CNN_M_1024_fast_rcnn(model): + return build_generic_detection_model( + model, VGG_CNN_M_1024.add_VGG_CNN_M_1024_conv5_body, + VGG_CNN_M_1024.add_VGG_CNN_M_1024_roi_fc_head + ) + + +def VGG16_fast_rcnn(model): + return build_generic_detection_model( + model, VGG16.add_VGG16_conv5_body, VGG16.add_VGG16_roi_fc_head + ) + + +def ResNet50_fast_rcnn(model): + return build_generic_detection_model( + model, ResNet.add_ResNet50_conv4_body, ResNet.add_ResNet_roi_conv5_head + ) + + +def ResNet101_fast_rcnn(model): + return build_generic_detection_model( + model, ResNet.add_ResNet101_conv4_body, ResNet.add_ResNet_roi_conv5_head + ) + + +def ResNet50_fast_rcnn_frozen_features(model): + return build_generic_detection_model( + model, + ResNet.add_ResNet50_conv4_body, + ResNet.add_ResNet_roi_conv5_head, + freeze_conv_body=True + ) + + +def ResNet101_fast_rcnn_frozen_features(model): + return build_generic_detection_model( + model, + 
ResNet.add_ResNet101_conv4_body, + ResNet.add_ResNet_roi_conv5_head, + freeze_conv_body=True + ) + + +# ---------------------------------------------------------------------------- # +# RPN-only models +# ---------------------------------------------------------------------------- # + + +def VGG_CNN_M_1024_rpn(model): + return build_generic_detection_model( + model, VGG_CNN_M_1024.add_VGG_CNN_M_1024_conv5_body + ) + + +def VGG16_rpn(model): + return build_generic_detection_model(model, VGG16.add_VGG16_conv5_body) + + +def ResNet50_rpn_conv4(model): + return build_generic_detection_model(model, ResNet.add_ResNet50_conv4_body) + + +def ResNet101_rpn_conv4(model): + return build_generic_detection_model(model, ResNet.add_ResNet101_conv4_body) + + +def VGG_CNN_M_1024_rpn_frozen_features(model): + return build_generic_detection_model( + model, + VGG_CNN_M_1024.add_VGG_CNN_M_1024_conv5_body, + freeze_conv_body=True + ) + + +def VGG16_rpn_frozen_features(model): + return build_generic_detection_model( + model, VGG16.add_VGG16_conv5_body, freeze_conv_body=True + ) + + +def ResNet50_rpn_conv4_frozen_features(model): + return build_generic_detection_model( + model, ResNet.add_ResNet50_conv4_body, freeze_conv_body=True + ) + + +def ResNet101_rpn_conv4_frozen_features(model): + return build_generic_detection_model( + model, ResNet.add_ResNet101_conv4_body, freeze_conv_body=True + ) + + +# ---------------------------------------------------------------------------- # +# Faster R-CNN models +# ---------------------------------------------------------------------------- # + + +def VGG16_faster_rcnn(model): + assert cfg.MODEL.FASTER_RCNN + return build_generic_detection_model( + model, VGG16.add_VGG16_conv5_body, VGG16.add_VGG16_roi_fc_head + ) + + +def ResNet50_faster_rcnn(model): + assert cfg.MODEL.FASTER_RCNN + return build_generic_detection_model( + model, ResNet.add_ResNet50_conv4_body, ResNet.add_ResNet_roi_conv5_head + ) + + +def ResNet101_faster_rcnn(model): + assert 
cfg.MODEL.FASTER_RCNN + return build_generic_detection_model( + model, ResNet.add_ResNet101_conv4_body, ResNet.add_ResNet_roi_conv5_head + ) + + +# ---------------------------------------------------------------------------- # +# R-FCN models +# ---------------------------------------------------------------------------- # + + +def ResNet50_rfcn(model): + return build_generic_rfcn_model( + model, ResNet.add_ResNet50_conv5_body, dim_reduce=1024 + ) + + +def ResNet101_rfcn(model): + return build_generic_rfcn_model( + model, ResNet.add_ResNet101_conv5_body, dim_reduce=1024 + ) diff --git a/detectron/modeling/name_compat.py b/detectron/modeling/name_compat.py new file mode 100644 index 0000000000000000000000000000000000000000..7899ea6e04474d87d8adef8a2e5baf3c40dda543 --- /dev/null +++ b/detectron/modeling/name_compat.py @@ -0,0 +1,62 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Handle mapping from old network building function names to new names. + +Flexible network configuration is achieved by specifying the function name that +builds a network module (e.g., the name of the conv backbone or the mask roi +head). However we may wish to change names over time without breaking previous +config files. This module provides backwards naming compatibility by providing +a mapping from the old name to the new name. 
+ +When renaming functions, it's generally a good idea to codemod existing yaml +config files. An easy way to batch edit, by example, is a shell command like + +$ find . -name "*.yaml" -exec sed -i -e \ + 's/head_builder\.add_roi_2mlp_head/fast_rcnn_heads.add_roi_2mlp_head/g' {} \; + +to perform the renaming: + head_builder.add_roi_2mlp_head => fast_rcnn_heads.add_roi_2mlp_head +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + + +_RENAME = { + # Removed "ResNet_" from the name because it wasn't relevent + 'mask_rcnn_heads.ResNet_mask_rcnn_fcn_head_v1up4convs': + 'mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs', + # Removed "ResNet_" from the name because it wasn't relevent + 'mask_rcnn_heads.ResNet_mask_rcnn_fcn_head_v1up': + 'mask_rcnn_heads.mask_rcnn_fcn_head_v1up', + # Removed "ResNet_" from the name because it wasn't relevent + 'mask_rcnn_heads.ResNet_mask_rcnn_fcn_head_v0upshare': + 'mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare', + # Removed "ResNet_" from the name because it wasn't relevent + 'mask_rcnn_heads.ResNet_mask_rcnn_fcn_head_v0up': + 'mask_rcnn_heads.mask_rcnn_fcn_head_v0up', + # Removed head_builder module in favor of the more specific fast_rcnn name + 'head_builder.add_roi_2mlp_head': + 'fast_rcnn_heads.add_roi_2mlp_head', +} + + +def get_new_name(func_name): + if func_name in _RENAME: + func_name = _RENAME[func_name] + return func_name diff --git a/detectron/modeling/optimizer.py b/detectron/modeling/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..4702428910561440c2a238da2945c5c835ca3841 --- /dev/null +++ b/detectron/modeling/optimizer.py @@ -0,0 +1,130 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Optimization operator graph construction.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging + +from caffe2.python import muji + +from detectron.core.config import cfg +import detectron.utils.c2 as c2_utils + +logger = logging.getLogger(__name__) + + +def build_data_parallel_model(model, single_gpu_build_func): + """Build a data parallel model given a function that builds the model on a + single GPU. 
+ """ + if model.only_build_forward_pass: + single_gpu_build_func(model) + elif model.train: + all_loss_gradients = _build_forward_graph(model, single_gpu_build_func) + # Add backward pass on all GPUs + model.AddGradientOperators(all_loss_gradients) + if cfg.NUM_GPUS > 1: + _add_allreduce_graph(model) + for gpu_id in range(cfg.NUM_GPUS): + # After allreduce, all GPUs perform SGD updates on their identical + # params and gradients in parallel + with c2_utils.NamedCudaScope(gpu_id): + add_single_gpu_param_update_ops(model, gpu_id) + else: + # Test-time network operates on single GPU + # Test-time parallelism is implemented through multiprocessing + with c2_utils.NamedCudaScope(model.target_gpu_id): + single_gpu_build_func(model) + + +def _build_forward_graph(model, single_gpu_build_func): + """Construct the forward graph on each GPU.""" + all_loss_gradients = {} # Will include loss gradients from all GPUs + # Build the model on each GPU with correct name and device scoping + for gpu_id in range(cfg.NUM_GPUS): + with c2_utils.NamedCudaScope(gpu_id): + all_loss_gradients.update(single_gpu_build_func(model)) + return all_loss_gradients + + +def _add_allreduce_graph(model): + """Construct the graph that performs Allreduce on the gradients.""" + # Need to all-reduce the per-GPU gradients if training with more than 1 GPU + all_params = model.TrainableParams() + assert len(all_params) % cfg.NUM_GPUS == 0 + # The model parameters are replicated on each GPU, get the number + # distinct parameter blobs (i.e., the number of parameter blobs on + # each GPU) + params_per_gpu = int(len(all_params) / cfg.NUM_GPUS) + with c2_utils.CudaScope(0): + # Iterate over distinct parameter blobs + for i in range(params_per_gpu): + # Gradients from all GPUs for this parameter blob + gradients = [ + model.param_to_grad[p] for p in all_params[i::params_per_gpu] + ] + if len(gradients) > 0: + if cfg.USE_NCCL: + model.net.NCCLAllreduce(gradients, gradients) + else: + muji.Allreduce(model.net, 
gradients, reduced_affix='') + + +def add_single_gpu_param_update_ops(model, gpu_id): + # Learning rate of 0 is a dummy value to be set properly at the + # start of training + lr = model.param_init_net.ConstantFill( + [], 'lr', shape=[1], value=0.0 + ) + one = model.param_init_net.ConstantFill( + [], 'one', shape=[1], value=1.0 + ) + wd = model.param_init_net.ConstantFill( + [], 'wd', shape=[1], value=cfg.SOLVER.WEIGHT_DECAY + ) + # weight decay of GroupNorm's parameters + wd_gn = model.param_init_net.ConstantFill( + [], 'wd_gn', shape=[1], value=cfg.SOLVER.WEIGHT_DECAY_GN + ) + for param in model.TrainableParams(gpu_id=gpu_id): + logger.debug('param ' + str(param) + ' will be updated') + param_grad = model.param_to_grad[param] + # Initialize momentum vector + param_momentum = model.param_init_net.ConstantFill( + [param], param + '_momentum', value=0.0 + ) + if param in model.biases: + # Special treatment for biases (mainly to match historical impl. + # details): + # (1) Do not apply weight decay + # (2) Use a 2x higher learning rate + model.Scale(param_grad, param_grad, scale=2.0) + elif param in model.gn_params: + # Special treatment for GroupNorm's parameters + model.WeightedSum([param_grad, one, param, wd_gn], param_grad) + elif cfg.SOLVER.WEIGHT_DECAY > 0: + # Apply weight decay to non-bias weights + model.WeightedSum([param_grad, one, param, wd], param_grad) + # Update param_grad and param_momentum in place + model.net.MomentumSGDUpdate( + [param_grad, param_momentum, lr, param], + [param_grad, param_momentum, param], + momentum=cfg.SOLVER.MOMENTUM + ) diff --git a/detectron/modeling/retinanet_heads.py b/detectron/modeling/retinanet_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..56f09f1c625019f7382489919aae2e8de14a8beb --- /dev/null +++ b/detectron/modeling/retinanet_heads.py @@ -0,0 +1,311 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""RetinaNet model heads and losses. See: https://arxiv.org/abs/1708.02002.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.blob as blob_utils + + +def get_retinanet_bias_init(model): + """Initialize the biases for the conv ops that predict class probabilities. + Initialization is performed such that at the start of training, all + locations are predicted to be background with high probability + (e.g., ~0.99 = 1 - cfg.RETINANET.PRIOR_PROB). See the Focal Loss paper for + details. 
+ """ + prior_prob = cfg.RETINANET.PRIOR_PROB + scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE + aspect_ratios = len(cfg.RETINANET.ASPECT_RATIOS) + if cfg.RETINANET.SOFTMAX: + # Multiclass softmax case + bias = np.zeros((model.num_classes, 1), dtype=np.float32) + bias[0] = np.log( + (model.num_classes - 1) * (1 - prior_prob) / (prior_prob) + ) + bias = np.vstack( + [bias for _ in range(scales_per_octave * aspect_ratios)] + ) + bias_init = ( + 'GivenTensorFill', { + 'values': bias.astype(dtype=np.float32) + } + ) + else: + # Per-class sigmoid (binary classification) case + bias_init = ( + 'ConstantFill', { + 'value': -np.log((1 - prior_prob) / prior_prob) + } + ) + return bias_init + + +def add_fpn_retinanet_outputs(model, blobs_in, dim_in, spatial_scales): + """RetinaNet head. For classification and box regression, we can chose to + have the same conv tower or a separate tower. "bl_feat_list" stores the list + of feature blobs for bbox prediction. These blobs can be shared cls feature + blobs if we share the tower or else are independent blobs. 
+ """ + dim_out = dim_in + k_max = cfg.FPN.RPN_MAX_LEVEL # coarsest level of pyramid + k_min = cfg.FPN.RPN_MIN_LEVEL # finest level of pyramid + A = len(cfg.RETINANET.ASPECT_RATIOS) * cfg.RETINANET.SCALES_PER_OCTAVE + + # compute init for bias + bias_init = get_retinanet_bias_init(model) + + assert len(blobs_in) == k_max - k_min + 1 + bbox_feat_list = [] + cls_pred_dim = ( + model.num_classes if cfg.RETINANET.SOFTMAX else (model.num_classes - 1) + ) + # unpacked bbox feature and add prediction layers + bbox_regr_dim = ( + 4 * (model.num_classes - 1) if cfg.RETINANET.CLASS_SPECIFIC_BBOX else 4 + ) + + # ========================================================================== + # classification tower with logits and prob prediction + # ========================================================================== + for lvl in range(k_min, k_max + 1): + bl_in = blobs_in[k_max - lvl] # blobs_in is in reversed order + # classification tower stack convolution starts + for nconv in range(cfg.RETINANET.NUM_CONVS): + suffix = 'n{}_fpn{}'.format(nconv, lvl) + dim_in, dim_out = dim_in, dim_in + if lvl == k_min: + bl_out = model.Conv( + bl_in, + 'retnet_cls_conv_' + suffix, + dim_in, + dim_out, + 3, + stride=1, + pad=1, + weight_init=('GaussianFill', { + 'std': 0.01 + }), + bias_init=('ConstantFill', { + 'value': 0. + }) + ) + else: + bl_out = model.ConvShared( + bl_in, + 'retnet_cls_conv_' + suffix, + dim_in, + dim_out, + 3, + stride=1, + pad=1, + weight='retnet_cls_conv_n{}_fpn{}_w'.format(nconv, k_min), + bias='retnet_cls_conv_n{}_fpn{}_b'.format(nconv, k_min) + ) + bl_in = model.Relu(bl_out, bl_out) + bl_feat = bl_in + # cls tower stack convolution ends. 
Add the logits layer now + if lvl == k_min: + retnet_cls_pred = model.Conv( + bl_feat, + 'retnet_cls_pred_fpn{}'.format(lvl), + dim_in, + cls_pred_dim * A, + 3, + pad=1, + stride=1, + weight_init=('GaussianFill', { + 'std': 0.01 + }), + bias_init=bias_init + ) + else: + retnet_cls_pred = model.ConvShared( + bl_feat, + 'retnet_cls_pred_fpn{}'.format(lvl), + dim_in, + cls_pred_dim * A, + 3, + pad=1, + stride=1, + weight='retnet_cls_pred_fpn{}_w'.format(k_min), + bias='retnet_cls_pred_fpn{}_b'.format(k_min) + ) + if not model.train: + if cfg.RETINANET.SOFTMAX: + model.net.GroupSpatialSoftmax( + retnet_cls_pred, + 'retnet_cls_prob_fpn{}'.format(lvl), + num_classes=cls_pred_dim + ) + else: + model.net.Sigmoid( + retnet_cls_pred, 'retnet_cls_prob_fpn{}'.format(lvl) + ) + if cfg.RETINANET.SHARE_CLS_BBOX_TOWER: + bbox_feat_list.append(bl_feat) + + # ========================================================================== + # bbox tower if not sharing features with the classification tower with + # logits and prob prediction + # ========================================================================== + if not cfg.RETINANET.SHARE_CLS_BBOX_TOWER: + for lvl in range(k_min, k_max + 1): + bl_in = blobs_in[k_max - lvl] # blobs_in is in reversed order + for nconv in range(cfg.RETINANET.NUM_CONVS): + suffix = 'n{}_fpn{}'.format(nconv, lvl) + dim_in, dim_out = dim_in, dim_in + if lvl == k_min: + bl_out = model.Conv( + bl_in, + 'retnet_bbox_conv_' + suffix, + dim_in, + dim_out, + 3, + stride=1, + pad=1, + weight_init=('GaussianFill', { + 'std': 0.01 + }), + bias_init=('ConstantFill', { + 'value': 0. 
+ }) + ) + else: + bl_out = model.ConvShared( + bl_in, + 'retnet_bbox_conv_' + suffix, + dim_in, + dim_out, + 3, + stride=1, + pad=1, + weight='retnet_bbox_conv_n{}_fpn{}_w'.format( + nconv, k_min + ), + bias='retnet_bbox_conv_n{}_fpn{}_b'.format( + nconv, k_min + ) + ) + bl_in = model.Relu(bl_out, bl_out) + # Add octave scales and aspect ratio + # At least 1 convolution for dealing different aspect ratios + bl_feat = bl_in + bbox_feat_list.append(bl_feat) + # Depending on the features [shared/separate] for bbox, add prediction layer + for i, lvl in enumerate(range(k_min, k_max + 1)): + bbox_pred = 'retnet_bbox_pred_fpn{}'.format(lvl) + bl_feat = bbox_feat_list[i] + if lvl == k_min: + model.Conv( + bl_feat, + bbox_pred, + dim_in, + bbox_regr_dim * A, + 3, + pad=1, + stride=1, + weight_init=('GaussianFill', { + 'std': 0.01 + }), + bias_init=('ConstantFill', { + 'value': 0. + }) + ) + else: + model.ConvShared( + bl_feat, + bbox_pred, + dim_in, + bbox_regr_dim * A, + 3, + pad=1, + stride=1, + weight='retnet_bbox_pred_fpn{}_w'.format(k_min), + bias='retnet_bbox_pred_fpn{}_b'.format(k_min) + ) + + +def add_fpn_retinanet_losses(model): + loss_gradients = {} + gradients, losses = [], [] + + k_max = cfg.FPN.RPN_MAX_LEVEL # coarsest level of pyramid + k_min = cfg.FPN.RPN_MIN_LEVEL # finest level of pyramid + + model.AddMetrics(['retnet_fg_num', 'retnet_bg_num']) + # ========================================================================== + # bbox regression loss - SelectSmoothL1Loss for multiple anchors at a location + # ========================================================================== + for lvl in range(k_min, k_max + 1): + suffix = 'fpn{}'.format(lvl) + bbox_loss = model.net.SelectSmoothL1Loss( + [ + 'retnet_bbox_pred_' + suffix, + 'retnet_roi_bbox_targets_' + suffix, + 'retnet_roi_fg_bbox_locs_' + suffix, 'retnet_fg_num' + ], + 'retnet_loss_bbox_' + suffix, + beta=cfg.RETINANET.BBOX_REG_BETA, + scale=model.GetLossScale() * cfg.RETINANET.BBOX_REG_WEIGHT + ) + 
gradients.append(bbox_loss) + losses.append('retnet_loss_bbox_' + suffix) + + # ========================================================================== + # cls loss - depends on softmax/sigmoid outputs + # ========================================================================== + for lvl in range(k_min, k_max + 1): + suffix = 'fpn{}'.format(lvl) + cls_lvl_logits = 'retnet_cls_pred_' + suffix + if not cfg.RETINANET.SOFTMAX: + cls_focal_loss = model.net.SigmoidFocalLoss( + [ + cls_lvl_logits, 'retnet_cls_labels_' + suffix, + 'retnet_fg_num' + ], + ['fl_{}'.format(suffix)], + gamma=cfg.RETINANET.LOSS_GAMMA, + alpha=cfg.RETINANET.LOSS_ALPHA, + scale=model.GetLossScale(), + num_classes=model.num_classes - 1 + ) + gradients.append(cls_focal_loss) + losses.append('fl_{}'.format(suffix)) + else: + cls_focal_loss, gated_prob = model.net.SoftmaxFocalLoss( + [ + cls_lvl_logits, 'retnet_cls_labels_' + suffix, + 'retnet_fg_num' + ], + ['fl_{}'.format(suffix), 'retnet_prob_{}'.format(suffix)], + gamma=cfg.RETINANET.LOSS_GAMMA, + alpha=cfg.RETINANET.LOSS_ALPHA, + scale=model.GetLossScale(), + num_classes=model.num_classes + ) + gradients.append(cls_focal_loss) + losses.append('fl_{}'.format(suffix)) + + loss_gradients.update(blob_utils.get_loss_gradients(model, gradients)) + model.AddLosses(losses) + return loss_gradients diff --git a/detectron/modeling/rfcn_heads.py b/detectron/modeling/rfcn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..5b54addfcba549cae3b0e744e81237753753d7cb --- /dev/null +++ b/detectron/modeling/rfcn_heads.py @@ -0,0 +1,99 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill + + +# ---------------------------------------------------------------------------- # +# R-FCN outputs and losses +# ---------------------------------------------------------------------------- # + +def add_rfcn_outputs(model, blob_in, dim_in, dim_reduce, spatial_scale): + if dim_reduce is not None: + # Optional dim reduction + blob_in = model.Conv( + blob_in, + 'conv_dim_reduce', + dim_in, + dim_reduce, + kernel=1, + pad=0, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + blob_in = model.Relu(blob_in, blob_in) + dim_in = dim_reduce + # Classification conv + model.Conv( + blob_in, + 'conv_cls', + dim_in, + model.num_classes * cfg.RFCN.PS_GRID_SIZE**2, + kernel=1, + pad=0, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + # Bounding-box regression conv + num_bbox_reg_classes = ( + 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else model.num_classes + ) + model.Conv( + blob_in, + 'conv_bbox_pred', + dim_in, + 4 * num_bbox_reg_classes * cfg.RFCN.PS_GRID_SIZE**2, + kernel=1, + pad=0, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + # Classification PS RoI pooling + model.net.PSRoIPool( + ['conv_cls', 'rois'], ['psroipooled_cls', 
'_mapping_channel_cls'], + group_size=cfg.RFCN.PS_GRID_SIZE, + output_dim=model.num_classes, + spatial_scale=spatial_scale + ) + model.AveragePool( + 'psroipooled_cls', 'cls_score_4d', kernel=cfg.RFCN.PS_GRID_SIZE + ) + model.net.Reshape( + 'cls_score_4d', ['cls_score', '_cls_scores_shape'], + shape=(-1, cfg.MODEL.NUM_CLASSES) + ) + if not model.train: + model.Softmax('cls_score', 'cls_prob', engine='CUDNN') + # Bbox regression PS RoI pooling + model.net.PSRoIPool( + ['conv_bbox_pred', 'rois'], + ['psroipooled_bbox', '_mapping_channel_bbox'], + group_size=cfg.RFCN.PS_GRID_SIZE, + output_dim=4 * num_bbox_reg_classes, + spatial_scale=spatial_scale + ) + model.AveragePool( + 'psroipooled_bbox', 'bbox_pred', kernel=cfg.RFCN.PS_GRID_SIZE + ) diff --git a/detectron/modeling/rpn_heads.py b/detectron/modeling/rpn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..1f0a9b5ae7e3af5026e18c11b03a5c4c0e941f2c --- /dev/null +++ b/detectron/modeling/rpn_heads.py @@ -0,0 +1,154 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from detectron.core.config import cfg +from detectron.modeling.generate_anchors import generate_anchors +from detectron.utils.c2 import const_fill +from detectron.utils.c2 import gauss_fill +import detectron.modeling.FPN as FPN +import detectron.utils.blob as blob_utils + + +# ---------------------------------------------------------------------------- # +# RPN and Faster R-CNN outputs and losses +# ---------------------------------------------------------------------------- # + +def add_generic_rpn_outputs(model, blob_in, dim_in, spatial_scale_in): + """Add RPN outputs (objectness classification and bounding box regression) + to an RPN model. Abstracts away the use of FPN. + """ + loss_gradients = None + if cfg.FPN.FPN_ON: + # Delegate to the FPN module + FPN.add_fpn_rpn_outputs(model, blob_in, dim_in, spatial_scale_in) + if cfg.MODEL.FASTER_RCNN: + # CollectAndDistributeFpnRpnProposals also labels proposals when in + # training mode + model.CollectAndDistributeFpnRpnProposals() + if model.train: + loss_gradients = FPN.add_fpn_rpn_losses(model) + else: + # Not using FPN, add RPN to a single scale + add_single_scale_rpn_outputs(model, blob_in, dim_in, spatial_scale_in) + if model.train: + loss_gradients = add_single_scale_rpn_losses(model) + return loss_gradients + + +def add_single_scale_rpn_outputs(model, blob_in, dim_in, spatial_scale): + """Add RPN outputs to a single scale model (i.e., no FPN).""" + anchors = generate_anchors( + stride=1. 
/ spatial_scale, + sizes=cfg.RPN.SIZES, + aspect_ratios=cfg.RPN.ASPECT_RATIOS + ) + num_anchors = anchors.shape[0] + dim_out = dim_in + # RPN hidden representation + model.Conv( + blob_in, + 'conv_rpn', + dim_in, + dim_out, + kernel=3, + pad=1, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + model.Relu('conv_rpn', 'conv_rpn') + # Proposal classification scores + model.Conv( + 'conv_rpn', + 'rpn_cls_logits', + dim_in, + num_anchors, + kernel=1, + pad=0, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + # Proposal bbox regression deltas + model.Conv( + 'conv_rpn', + 'rpn_bbox_pred', + dim_in, + 4 * num_anchors, + kernel=1, + pad=0, + stride=1, + weight_init=gauss_fill(0.01), + bias_init=const_fill(0.0) + ) + + if not model.train or cfg.MODEL.FASTER_RCNN: + # Proposals are needed during: + # 1) inference (== not model.train) for RPN only and Faster R-CNN + # OR + # 2) training for Faster R-CNN + # Otherwise (== training for RPN only), proposals are not needed + model.net.Sigmoid('rpn_cls_logits', 'rpn_cls_probs') + model.GenerateProposals( + ['rpn_cls_probs', 'rpn_bbox_pred', 'im_info'], + ['rpn_rois', 'rpn_roi_probs'], + anchors=anchors, + spatial_scale=spatial_scale + ) + + if cfg.MODEL.FASTER_RCNN: + if model.train: + # Add op that generates training labels for in-network RPN proposals + model.GenerateProposalLabels(['rpn_rois', 'roidb', 'im_info']) + else: + # Alias rois to rpn_rois for inference + model.net.Alias('rpn_rois', 'rois') + + +def add_single_scale_rpn_losses(model): + """Add losses for a single scale RPN model (i.e., no FPN).""" + # Spatially narrow the full-sized RPN label arrays to match the feature map + # shape + model.net.SpatialNarrowAs( + ['rpn_labels_int32_wide', 'rpn_cls_logits'], 'rpn_labels_int32' + ) + for key in ('targets', 'inside_weights', 'outside_weights'): + model.net.SpatialNarrowAs( + ['rpn_bbox_' + key + '_wide', 'rpn_bbox_pred'], 'rpn_bbox_' + key + ) + loss_rpn_cls = 
model.net.SigmoidCrossEntropyLoss( + ['rpn_cls_logits', 'rpn_labels_int32'], + 'loss_rpn_cls', + scale=model.GetLossScale() + ) + loss_rpn_bbox = model.net.SmoothL1Loss( + [ + 'rpn_bbox_pred', 'rpn_bbox_targets', 'rpn_bbox_inside_weights', + 'rpn_bbox_outside_weights' + ], + 'loss_rpn_bbox', + beta=1. / 9., + scale=model.GetLossScale() + ) + loss_gradients = blob_utils.get_loss_gradients( + model, [loss_rpn_cls, loss_rpn_bbox] + ) + model.AddLosses(['loss_rpn_cls', 'loss_rpn_bbox']) + return loss_gradients diff --git a/detectron/ops/__init__.py b/detectron/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py b/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py new file mode 100644 index 0000000000000000000000000000000000000000..76c5ed3bdceef3ad7ec7386121544bf719c48869 --- /dev/null +++ b/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py @@ -0,0 +1,113 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +from detectron.core.config import cfg +from detectron.datasets import json_dataset +from detectron.datasets import roidb as roidb_utils +import detectron.modeling.FPN as fpn +import detectron.roi_data.fast_rcnn as fast_rcnn_roi_data +import detectron.utils.blob as blob_utils + + +class CollectAndDistributeFpnRpnProposalsOp(object): + def __init__(self, train): + self._train = train + + def forward(self, inputs, outputs): + """See modeling.detector.CollectAndDistributeFpnRpnProposals for + inputs/outputs documentation. + """ + # inputs is + # [rpn_rois_fpn2, ..., rpn_rois_fpn6, + # rpn_roi_probs_fpn2, ..., rpn_roi_probs_fpn6] + # If training with Faster R-CNN, then inputs will additionally include + # + [roidb, im_info] + rois = collect(inputs, self._train) + if self._train: + # During training we reuse the data loader code. We populate roidb + # entries on the fly using the rois generated by RPN. + # im_info: [[im_height, im_width, im_scale], ...] + im_info = inputs[-1].data + im_scales = im_info[:, 2] + roidb = blob_utils.deserialize(inputs[-2].data) + # For historical consistency with the original Faster R-CNN + # implementation we are *not* filtering crowd proposals. + # This choice should be investigated in the future (it likely does + # not matter). 
+ json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0) + roidb_utils.add_bbox_regression_targets(roidb) + # Compute training labels for the RPN proposals; also handles + # distributing the proposals over FPN levels + output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names() + blobs = {k: [] for k in output_blob_names} + fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb) + for i, k in enumerate(output_blob_names): + blob_utils.py_op_copy_blob(blobs[k], outputs[i]) + else: + # For inference we have a special code path that avoids some data + # loader overhead + distribute(rois, None, outputs, self._train) + + +def collect(inputs, is_training): + cfg_key = 'TRAIN' if is_training else 'TEST' + post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N + k_max = cfg.FPN.RPN_MAX_LEVEL + k_min = cfg.FPN.RPN_MIN_LEVEL + num_lvls = k_max - k_min + 1 + roi_inputs = inputs[:num_lvls] + score_inputs = inputs[num_lvls:] + if is_training: + score_inputs = score_inputs[:-2] + + # rois are in [[batch_idx, x0, y0, x1, y2], ...] format + # Combine predictions across all levels and retain the top scoring + rois = np.concatenate([blob.data for blob in roi_inputs]) + scores = np.concatenate([blob.data for blob in score_inputs]).squeeze() + inds = np.argsort(-scores)[:post_nms_topN] + rois = rois[inds, :] + return rois + + +def distribute(rois, label_blobs, outputs, train): + """To understand the output blob order see return value of + detectron.roi_data.fast_rcnn.get_fast_rcnn_blob_names(is_training=False) + """ + lvl_min = cfg.FPN.ROI_MIN_LEVEL + lvl_max = cfg.FPN.ROI_MAX_LEVEL + lvls = fpn.map_rois_to_fpn_levels(rois[:, 1:5], lvl_min, lvl_max) + + outputs[0].reshape(rois.shape) + outputs[0].data[...] = rois + + # Create new roi blobs for each FPN level + # (See: modeling.FPN.add_multilevel_roi_blobs which is similar but annoying + # to generalize to support this particular case.) 
+ rois_idx_order = np.empty((0, )) + for output_idx, lvl in enumerate(range(lvl_min, lvl_max + 1)): + idx_lvl = np.where(lvls == lvl)[0] + blob_roi_level = rois[idx_lvl, :] + outputs[output_idx + 1].reshape(blob_roi_level.shape) + outputs[output_idx + 1].data[...] = blob_roi_level + rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) + rois_idx_restore = np.argsort(rois_idx_order) + blob_utils.py_op_copy_blob(rois_idx_restore.astype(np.int32), outputs[-1]) diff --git a/detectron/ops/generate_proposal_labels.py b/detectron/ops/generate_proposal_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..c1231c2f79b257ebc541a981836bd042fd4520d5 --- /dev/null +++ b/detectron/ops/generate_proposal_labels.py @@ -0,0 +1,54 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging + +from detectron.datasets import json_dataset +from detectron.datasets import roidb as roidb_utils +from detectron.utils import blob as blob_utils +import detectron.roi_data.fast_rcnn as fast_rcnn_roi_data + +logger = logging.getLogger(__name__) + + +class GenerateProposalLabelsOp(object): + + def forward(self, inputs, outputs): + """See modeling.detector.GenerateProposalLabels for inputs/outputs + documentation. + """ + # During training we reuse the data loader code. We populate roidb + # entries on the fly using the rois generated by RPN. + # im_info: [[im_height, im_width, im_scale], ...] + rois = inputs[0].data + roidb = blob_utils.deserialize(inputs[1].data) + im_info = inputs[2].data + im_scales = im_info[:, 2] + output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names() + # For historical consistency with the original Faster R-CNN + # implementation we are *not* filtering crowd proposals. + # This choice should be investigated in the future (it likely does + # not matter). + json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0) + roidb_utils.add_bbox_regression_targets(roidb) + blobs = {k: [] for k in output_blob_names} + fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb) + for i, k in enumerate(output_blob_names): + blob_utils.py_op_copy_blob(blobs[k], outputs[i]) diff --git a/detectron/ops/generate_proposals.py b/detectron/ops/generate_proposals.py new file mode 100644 index 0000000000000000000000000000000000000000..7238c07a202d1ffd6237b4c831cc6bc38d24ac45 --- /dev/null +++ b/detectron/ops/generate_proposals.py @@ -0,0 +1,197 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.boxes as box_utils + + +class GenerateProposalsOp(object): + """Output object detection proposals by applying estimated bounding-box + transformations to a set of regular boxes (called "anchors"). + + See comment in utils/boxes:bbox_transform_inv for details abouts the + optional `reg_weights` parameter. + """ + + def __init__(self, anchors, spatial_scale, train, reg_weights=(1.0, 1.0, 1.0, 1.0)): + self._anchors = anchors + self._num_anchors = self._anchors.shape[0] + self._feat_stride = 1. / spatial_scale + self._train = train + self._reg_weights = reg_weights + + def forward(self, inputs, outputs): + """See modeling.detector.GenerateProposals for inputs/outputs + documentation. + """ + # 1. for each location i in a (H, W) grid: + # generate A anchor boxes centered on cell i + # apply predicted bbox deltas to each of the A anchors at cell i + # 2. clip predicted boxes to image + # 3. remove predicted boxes with either height or width < threshold + # 4. 
sort all (proposal, score) pairs by score from highest to lowest + # 5. take the top pre_nms_topN proposals before NMS + # 6. apply NMS with a loose threshold (0.7) to the remaining proposals + # 7. take after_nms_topN proposals after NMS + # 8. return the top proposals + + # predicted probability of fg object for each RPN anchor + scores = inputs[0].data + # predicted achors transformations + bbox_deltas = inputs[1].data + # input image (height, width, scale), in which scale is the scale factor + # applied to the original dataset image to get the network input image + im_info = inputs[2].data + # 1. Generate proposals from bbox deltas and shifted anchors + height, width = scores.shape[-2:] + # Enumerate all shifted positions on the (H, W) grid + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y, copy=False) + # Convert to (K, 4), K=H*W, where the columns are (dx, dy, dx, dy) + # shift pointing to each grid location + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + + # Broacast anchors over shifts to enumerate all anchors at all positions + # in the (H, W) grid: + # - add A anchors of shape (1, A, 4) to + # - K shifts of shape (K, 1, 4) to get + # - all shifted anchors of shape (K, A, 4) + # - reshape to (K*A, 4) shifted anchors + num_images = inputs[0].shape[0] + A = self._num_anchors + K = shifts.shape[0] + all_anchors = self._anchors[np.newaxis, :, :] + shifts[:, np.newaxis, :] + all_anchors = all_anchors.reshape((K * A, 4)) + + rois = np.empty((0, 5), dtype=np.float32) + roi_probs = np.empty((0, 1), dtype=np.float32) + for im_i in range(num_images): + im_i_boxes, im_i_probs = self.proposals_for_one_image( + im_info[im_i, :], all_anchors, bbox_deltas[im_i, :, :, :], + scores[im_i, :, :, :] + ) + batch_inds = im_i * np.ones( + (im_i_boxes.shape[0], 1), dtype=np.float32 + ) + im_i_rois = 
np.hstack((batch_inds, im_i_boxes)) + rois = np.append(rois, im_i_rois, axis=0) + roi_probs = np.append(roi_probs, im_i_probs, axis=0) + + outputs[0].reshape(rois.shape) + outputs[0].data[...] = rois + if len(outputs) > 1: + outputs[1].reshape(roi_probs.shape) + outputs[1].data[...] = roi_probs + + def proposals_for_one_image( + self, im_info, all_anchors, bbox_deltas, scores + ): + # Get mode-dependent configuration + cfg_key = 'TRAIN' if self._train else 'TEST' + pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N + post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N + nms_thresh = cfg[cfg_key].RPN_NMS_THRESH + min_size = cfg[cfg_key].RPN_MIN_SIZE + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # - bbox deltas will be (4 * A, H, W) format from conv output + # - transpose to (H, W, 4 * A) + # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A) + # in slowest to fastest order to match the enumerated anchors + bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4)) + + # Same story for the scores: + # - scores are (A, H, W) format from conv output + # - transpose to (H, W, A) + # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) + # to match the order of anchors and bbox_deltas + scores = scores.transpose((1, 2, 0)).reshape((-1, 1)) + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 
6000) + if pre_nms_topN <= 0 or pre_nms_topN >= len(scores): + order = np.argsort(-scores.squeeze()) + else: + # Avoid sorting possibly large arrays; First partition to get top K + # unsorted and then sort just those (~20x faster for 200k scores) + inds = np.argpartition( + -scores.squeeze(), pre_nms_topN + )[:pre_nms_topN] + order = np.argsort(-scores[inds].squeeze()) + order = inds[order] + bbox_deltas = bbox_deltas[order, :] + all_anchors = all_anchors[order, :] + scores = scores[order] + + # Transform anchors into proposals via bbox transformations + proposals = box_utils.bbox_transform(all_anchors, bbox_deltas, self._reg_weights) + + # 2. clip proposals to image (may result in proposals with zero area + # that will be removed in the next step) + proposals = box_utils.clip_tiled_boxes(proposals, im_info[:2]) + + # 3. remove predicted boxes with either height or width < min_size + keep = _filter_boxes(proposals, min_size, im_info) + proposals = proposals[keep, :] + scores = scores[keep] + + # 6. apply loose nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + if nms_thresh > 0: + keep = box_utils.nms(np.hstack((proposals, scores)), nms_thresh) + if post_nms_topN > 0: + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + return proposals, scores + + +def _filter_boxes(boxes, min_size, im_info): + """Only keep boxes with both sides >= min_size and center within the image. + """ + # Compute the width and height of the proposal boxes as measured in the original + # image coordinate system (this is required to avoid "Negative Areas Found" + # assertions in other parts of the code that measure). 
+    im_scale = im_info[2]
+    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
+    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
+    # To avoid numerical issues we require the min_size to be at least 1 pixel in the
+    # original image
+    min_size = np.maximum(min_size, 1)
+    # Proposal center is computed relative to the scaled input image
+    ws = boxes[:, 2] - boxes[:, 0] + 1
+    hs = boxes[:, 3] - boxes[:, 1] + 1
+    x_ctr = boxes[:, 0] + ws / 2.
+    y_ctr = boxes[:, 1] + hs / 2.
+    keep = np.where(
+        (ws_orig_scale >= min_size)
+        & (hs_orig_scale >= min_size)
+        & (x_ctr < im_info[1])
+        & (y_ctr < im_info[0])
+    )[0]
+    return keep
diff --git a/detectron/ops/zero_even_op.cc b/detectron/ops/zero_even_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b77fb24d436f24b68016465cdbf44b9627944b4
--- /dev/null
+++ b/detectron/ops/zero_even_op.cc
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "zero_even_op.h"
+
+namespace caffe2 {
+
+// NOTE(review): the explicit specialization arguments <float, CPUContext>
+// were stripped from this patch (angle brackets lost, likely by an HTML/tag
+// sanitizer); restored here so the operator compiles.
+template <>
+bool ZeroEvenOp<float, CPUContext>::RunOnDevice() {
+  // Retrieve the input tensor.
+  const auto& X = Input(0);
+  CAFFE_ENFORCE(X.dim() == 1);
+
+  // Initialize the output tensor to a copy of the input tensor.
+  auto* Y = Output(0);
+  Y->CopyFrom(X);
+
+  // Set output elements at even indices to zero.
+  // mutable_data requires its element-type template argument (restored).
+  auto* Y_data = Y->mutable_data<float>();
+  for (auto i = 0; i < Y->numel(); i += 2) {
+    Y_data[i] = 0.0f;
+  }
+
+  return true;
+}
+
+REGISTER_CPU_OPERATOR(ZeroEven, ZeroEvenOp<float, CPUContext>);
+
+OPERATOR_SCHEMA(ZeroEven)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .Input(
+        0,
+        "X",
+        "1D input tensor")
+    .Output(
+        0,
+        "Y",
+        "1D output tensor");
+
+} // namespace caffe2
diff --git a/detectron/ops/zero_even_op.cu b/detectron/ops/zero_even_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a606727d9a5a5c063116834c071553c4501c0f80
--- /dev/null
+++ b/detectron/ops/zero_even_op.cu
@@ -0,0 +1,64 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "caffe2/core/context_gpu.h"
+
+#include "zero_even_op.h"
+
+namespace caffe2 {
+
+namespace {
+
+// Restored the stripped `template <typename T>` header for this kernel.
+template <typename T>
+__global__ void SetEvenIndsToVal(size_t num_even_inds, T val, T* data) {
+  CUDA_1D_KERNEL_LOOP(i, num_even_inds) {
+    data[i << 1] = val;
+  }
+}
+
+} // namespace
+
+// Restored the stripped <float, CUDAContext> specialization arguments.
+template <>
+bool ZeroEvenOp<float, CUDAContext>::RunOnDevice() {
+  // Retrieve the input tensor.
+  const auto& X = Input(0);
+  CAFFE_ENFORCE(X.ndim() == 1);
+
+  // Initialize the output tensor to a copy of the input tensor.
+  auto* Y = Output(0);
+  Y->CopyFrom(X);
+
+  // Set output elements at even indices to zero.
+  auto output_size = Y->size();
+
+  if (output_size > 0) {
+    size_t num_even_inds = output_size / 2 + output_size % 2;
+    // NOTE(review): the kernel launch lost its <float> template argument and
+    // the entire <<<grid, block, shmem, stream>>> execution configuration in
+    // this patch (angle brackets stripped); restored below.
+    SetEvenIndsToVal<float>
+        <<<CAFFE_GET_BLOCKS(num_even_inds),
+           CAFFE_CUDA_NUM_THREADS,
+           0,
+           context_.cuda_stream()>>>(
+            num_even_inds,
+            0.0f,
+            Y->mutable_data<float>());
+  }
+
+  return true;
+}
+
+REGISTER_CUDA_OPERATOR(ZeroEven, ZeroEvenOp<float, CUDAContext>);
+
+} // namespace caffe2
diff --git a/detectron/ops/zero_even_op.h b/detectron/ops/zero_even_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6aa3da8bc709d9f4c6870e8edfa0844510a1d10d
--- /dev/null
+++ b/detectron/ops/zero_even_op.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ZERO_EVEN_OP_H_
+#define ZERO_EVEN_OP_H_
+
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+
+namespace caffe2 {
+
+/**
+ * ZeroEven operator. Zeros elements at even indices of an 1D array.
+ * Elements at odd indices are preserved.
+ *
+ * This toy operator is an example of a custom operator and may be a useful
+ * reference for adding new custom operators to the Detectron codebase.
+ */
+template <typename T, class Context>
+class ZeroEvenOp final : public Operator<Context> {
+ public:
+  // Introduce Operator helper members.
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+
+  ZeroEvenOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  bool RunOnDevice() override;
+};
+
+} // namespace caffe2
+
+#endif // ZERO_EVEN_OP_H_
diff --git a/detectron/roi_data/__init__.py b/detectron/roi_data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detectron/roi_data/data_utils.py b/detectron/roi_data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..54de8a35228534784ec52d58912e5d0ea42f0941
--- /dev/null
+++ b/detectron/roi_data/data_utils.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+##############################################################################
+
+"""Common utility functions for RPN and RetinaNet minibatch blobs preparation.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from collections import namedtuple
+import logging
+import numpy as np
+import threading
+
+from detectron.core.config import cfg
+from detectron.modeling.generate_anchors import generate_anchors
+import detectron.utils.boxes as box_utils
+
+logger = logging.getLogger(__name__)
+
+
+# octave and aspect fields are only used on RetinaNet.
Octave corresponds to the +# scale of the anchor and aspect denotes which aspect ratio is used in the range +# of aspect ratios +FieldOfAnchors = namedtuple( + 'FieldOfAnchors', [ + 'field_of_anchors', 'num_cell_anchors', 'stride', 'field_size', + 'octave', 'aspect' + ] +) + +# Cache for memoizing _get_field_of_anchors +_threadlocal_foa = threading.local() + + +def get_field_of_anchors( + stride, anchor_sizes, anchor_aspect_ratios, octave=None, aspect=None +): + global _threadlocal_foa + if not hasattr(_threadlocal_foa, 'cache'): + _threadlocal_foa.cache = {} + + cache_key = str(stride) + str(anchor_sizes) + str(anchor_aspect_ratios) + if cache_key in _threadlocal_foa.cache: + return _threadlocal_foa.cache[cache_key] + + # Anchors at a single feature cell + cell_anchors = generate_anchors( + stride=stride, sizes=anchor_sizes, aspect_ratios=anchor_aspect_ratios + ) + num_cell_anchors = cell_anchors.shape[0] + + # Generate canonical proposals from shifted anchors + # Enumerate all shifted positions on the (H, W) grid + fpn_max_size = cfg.FPN.COARSEST_STRIDE * np.ceil( + cfg.TRAIN.MAX_SIZE / float(cfg.FPN.COARSEST_STRIDE) + ) + field_size = int(np.ceil(fpn_max_size / float(stride))) + shifts = np.arange(0, field_size) * stride + shift_x, shift_y = np.meshgrid(shifts, shifts) + shift_x = shift_x.ravel() + shift_y = shift_y.ravel() + shifts = np.vstack((shift_x, shift_y, shift_x, shift_y)).transpose() + + # Broacast anchors over shifts to enumerate all anchors at all positions + # in the (H, W) grid: + # - add A cell anchors of shape (1, A, 4) to + # - K shifts of shape (K, 1, 4) to get + # - all shifted anchors of shape (K, A, 4) + # - reshape to (K*A, 4) shifted anchors + A = num_cell_anchors + K = shifts.shape[0] + field_of_anchors = ( + cell_anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) + ) + field_of_anchors = field_of_anchors.reshape((K * A, 4)) + foa = FieldOfAnchors( + field_of_anchors=field_of_anchors.astype(np.float32), + 
num_cell_anchors=num_cell_anchors, + stride=stride, + field_size=field_size, + octave=octave, + aspect=aspect + ) + _threadlocal_foa.cache[cache_key] = foa + return foa + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of + size count)""" + if count == len(inds): + return data + + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=data.dtype) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=data.dtype) + ret.fill(fill) + ret[inds, :] = data + return ret + + +def compute_targets(ex_rois, gt_rois, weights=(1.0, 1.0, 1.0, 1.0)): + """Compute bounding-box regression targets for an image.""" + return box_utils.bbox_transform_inv(ex_rois, gt_rois, weights).astype( + np.float32, copy=False + ) diff --git a/detectron/roi_data/fast_rcnn.py b/detectron/roi_data/fast_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..56e96987b7ea2fa4f4ba11573d24ccb687707ae5 --- /dev/null +++ b/detectron/roi_data/fast_rcnn.py @@ -0,0 +1,265 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Construct minibatches for Fast R-CNN training. Handles the minibatch blobs +that are specific to Fast R-CNN. Other blobs that are generic to RPN, etc. +are handled by their respecitive roi_data modules. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np +import numpy.random as npr + +from detectron.core.config import cfg +import detectron.modeling.FPN as fpn +import detectron.roi_data.keypoint_rcnn as keypoint_rcnn_roi_data +import detectron.roi_data.mask_rcnn as mask_rcnn_roi_data +import detectron.utils.blob as blob_utils +import detectron.utils.boxes as box_utils + +logger = logging.getLogger(__name__) + + +def get_fast_rcnn_blob_names(is_training=True): + """Fast R-CNN blob names.""" + # rois blob: holds R regions of interest, each is a 5-tuple + # (batch_idx, x1, y1, x2, y2) specifying an image batch index and a + # rectangle (x1, y1, x2, y2) + blob_names = ['rois'] + if is_training: + # labels_int32 blob: R categorical labels in [0, ..., K] for K + # foreground classes plus background + blob_names += ['labels_int32'] + if is_training: + # bbox_targets blob: R bounding-box regression targets with 4 + # targets per class + blob_names += ['bbox_targets'] + # bbox_inside_weights blob: At most 4 targets per roi are active + # this binary vector sepcifies the subset of active targets + blob_names += ['bbox_inside_weights'] + blob_names += ['bbox_outside_weights'] + if is_training and cfg.MODEL.MASK_ON: + # 'mask_rois': RoIs sampled for training the mask prediction branch. + # Shape is (#masks, 5) in format (batch_idx, x1, y1, x2, y2). + blob_names += ['mask_rois'] + # 'roi_has_mask': binary labels for the RoIs specified in 'rois' + # indicating if each RoI has a mask or not. Note that in some cases + # a *bg* RoI will have an all -1 (ignore) mask associated with it in + # the case that no fg RoIs can be sampled. Shape is (batchsize). + blob_names += ['roi_has_mask_int32'] + # 'masks_int32' holds binary masks for the RoIs specified in + # 'mask_rois'. Shape is (#fg, M * M) where M is the ground truth + # mask size. 
+ blob_names += ['masks_int32'] + if is_training and cfg.MODEL.KEYPOINTS_ON: + # 'keypoint_rois': RoIs sampled for training the keypoint prediction + # branch. Shape is (#instances, 5) in format (batch_idx, x1, y1, x2, + # y2). + blob_names += ['keypoint_rois'] + # 'keypoint_locations_int32': index of keypoint in + # KRCNN.HEATMAP_SIZE**2 sized array. Shape is (#instances). Used in + # SoftmaxWithLoss. + blob_names += ['keypoint_locations_int32'] + # 'keypoint_weights': weight assigned to each target in + # 'keypoint_locations_int32'. Shape is (#instances). Used in + # SoftmaxWithLoss. + blob_names += ['keypoint_weights'] + # 'keypoint_loss_normalizer': optional normalization factor to use if + # cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False. + blob_names += ['keypoint_loss_normalizer'] + if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: + # Support for FPN multi-level rois without bbox reg isn't + # implemented (... and may never be implemented) + k_max = cfg.FPN.ROI_MAX_LEVEL + k_min = cfg.FPN.ROI_MIN_LEVEL + # Same format as rois blob, but one per FPN level + for lvl in range(k_min, k_max + 1): + blob_names += ['rois_fpn' + str(lvl)] + blob_names += ['rois_idx_restore_int32'] + if is_training: + if cfg.MODEL.MASK_ON: + for lvl in range(k_min, k_max + 1): + blob_names += ['mask_rois_fpn' + str(lvl)] + blob_names += ['mask_rois_idx_restore_int32'] + if cfg.MODEL.KEYPOINTS_ON: + for lvl in range(k_min, k_max + 1): + blob_names += ['keypoint_rois_fpn' + str(lvl)] + blob_names += ['keypoint_rois_idx_restore_int32'] + return blob_names + + +def add_fast_rcnn_blobs(blobs, im_scales, roidb): + """Add blobs needed for training Fast R-CNN style models.""" + # Sample training RoIs from each image and append them to the blob lists + for im_i, entry in enumerate(roidb): + frcn_blobs = _sample_rois(entry, im_scales[im_i], im_i) + for k, v in frcn_blobs.items(): + blobs[k].append(v) + # Concat the training blob lists into tensors + for k, v in blobs.items(): + if 
isinstance(v, list) and len(v) > 0: + blobs[k] = np.concatenate(v) + # Add FPN multilevel training RoIs, if configured + if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: + _add_multilevel_rois(blobs) + + # Perform any final work and validity checks after the collating blobs for + # all minibatch images + valid = True + if cfg.MODEL.KEYPOINTS_ON: + valid = keypoint_rcnn_roi_data.finalize_keypoint_minibatch(blobs, valid) + + return valid + + +def _sample_rois(roidb, im_scale, batch_idx): + """Generate a random sample of RoIs comprising foreground and background + examples. + """ + rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM) + fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)) + max_overlaps = roidb['max_overlaps'] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice( + fg_inds, size=fg_rois_per_this_image, replace=False + ) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where( + (max_overlaps < cfg.TRAIN.BG_THRESH_HI) & + (max_overlaps >= cfg.TRAIN.BG_THRESH_LO) + )[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size) + # Sample foreground regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice( + bg_inds, size=bg_rois_per_this_image, replace=False + ) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Label is the class each RoI has max overlap with + sampled_labels = roidb['max_classes'][keep_inds] 
+ sampled_labels[fg_rois_per_this_image:] = 0 # Label bg RoIs with class 0 + sampled_boxes = roidb['boxes'][keep_inds] + + bbox_targets, bbox_inside_weights = _expand_bbox_targets( + roidb['bbox_targets'][keep_inds, :] + ) + bbox_outside_weights = np.array( + bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype + ) + + # Scale rois and format as (batch_idx, x1, y1, x2, y2) + sampled_rois = sampled_boxes * im_scale + repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1)) + sampled_rois = np.hstack((repeated_batch_idx, sampled_rois)) + + # Base Fast R-CNN blobs + blob_dict = dict( + labels_int32=sampled_labels.astype(np.int32, copy=False), + rois=sampled_rois, + bbox_targets=bbox_targets, + bbox_inside_weights=bbox_inside_weights, + bbox_outside_weights=bbox_outside_weights + ) + + # Optionally add Mask R-CNN blobs + if cfg.MODEL.MASK_ON: + mask_rcnn_roi_data.add_mask_rcnn_blobs( + blob_dict, sampled_boxes, roidb, im_scale, batch_idx + ) + + # Optionally add Keypoint R-CNN blobs + if cfg.MODEL.KEYPOINTS_ON: + keypoint_rcnn_roi_data.add_keypoint_rcnn_blobs( + blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx + ) + + return blob_dict + + +def _expand_bbox_targets(bbox_target_data): + """Bounding-box regression targets are stored in a compact form in the + roidb. + + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). The loss weights + are similarly expanded. 
+ + Returns: + bbox_target_data (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + num_bbox_reg_classes = cfg.MODEL.NUM_CLASSES + if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG: + num_bbox_reg_classes = 2 # bg and fg + + clss = bbox_target_data[:, 0] + bbox_targets = blob_utils.zeros((clss.size, 4 * num_bbox_reg_classes)) + bbox_inside_weights = blob_utils.zeros(bbox_targets.shape) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = int(clss[ind]) + start = 4 * cls + end = start + 4 + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + bbox_inside_weights[ind, start:end] = (1.0, 1.0, 1.0, 1.0) + return bbox_targets, bbox_inside_weights + + +def _add_multilevel_rois(blobs): + """By default training RoIs are added for a single feature map level only. + When using FPN, the RoIs must be distributed over different FPN levels + according the level assignment heuristic (see: modeling.FPN. + map_rois_to_fpn_levels). + """ + lvl_min = cfg.FPN.ROI_MIN_LEVEL + lvl_max = cfg.FPN.ROI_MAX_LEVEL + + def _distribute_rois_over_fpn_levels(rois_blob_name): + """Distribute rois over the different FPN levels.""" + # Get target level for each roi + # Recall blob rois are in (batch_idx, x1, y1, x2, y2) format, hence take + # the box coordinates from columns 1:5 + target_lvls = fpn.map_rois_to_fpn_levels( + blobs[rois_blob_name][:, 1:5], lvl_min, lvl_max + ) + # Add per FPN level roi blobs named like: _fpn + fpn.add_multilevel_roi_blobs( + blobs, rois_blob_name, blobs[rois_blob_name], target_lvls, lvl_min, + lvl_max + ) + + _distribute_rois_over_fpn_levels('rois') + if cfg.MODEL.MASK_ON: + _distribute_rois_over_fpn_levels('mask_rois') + if cfg.MODEL.KEYPOINTS_ON: + _distribute_rois_over_fpn_levels('keypoint_rois') diff --git a/detectron/roi_data/keypoint_rcnn.py b/detectron/roi_data/keypoint_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..b1aa3dd54f057f97fb255380fbd518bfbc113f20 --- /dev/null +++ 
b/detectron/roi_data/keypoint_rcnn.py @@ -0,0 +1,129 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Construct minibatches for Mask R-CNN training when keypoints are enabled. +Handles the minibatch blobs that are specific to training Mask R-CNN for +keypoint detection. Other blobs that are generic to RPN or Fast/er R-CNN are +handled by their respecitive roi_data modules. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.blob as blob_utils +import detectron.utils.keypoints as keypoint_utils + +logger = logging.getLogger(__name__) + + +def add_keypoint_rcnn_blobs( + blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx +): + """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary.""" + # Note: gt_inds must match how they're computed in + # datasets.json_dataset._merge_proposal_boxes_into_roidb + gt_inds = np.where(roidb['gt_classes'] > 0)[0] + max_overlaps = roidb['max_overlaps'] + gt_keypoints = roidb['gt_keypoints'] + + ind_kp = gt_inds[roidb['box_to_gt_ind_map']] + within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes']) + vis_kp = gt_keypoints[ind_kp, 2, :] > 0 + is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0 + kp_fg_inds = np.where( + np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible) + )[0] + + kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size) + if kp_fg_inds.size > kp_fg_rois_per_this_image: + kp_fg_inds = np.random.choice( + kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False + ) + + sampled_fg_rois = roidb['boxes'][kp_fg_inds] + box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds] + + num_keypoints = gt_keypoints.shape[2] + sampled_keypoints = -np.ones( + (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints), + dtype=gt_keypoints.dtype + ) + for ii in range(len(sampled_fg_rois)): + ind = box_to_gt_ind_map[ii] + if ind >= 0: + sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :] + assert np.sum(sampled_keypoints[ii, 2, :]) > 0 + + heats, weights = keypoint_utils.keypoints_to_heatmap_labels( + sampled_keypoints, sampled_fg_rois + ) + + shape = (sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 
1) + heats = heats.reshape(shape) + weights = weights.reshape(shape) + + sampled_fg_rois *= im_scale + repeated_batch_idx = batch_idx * blob_utils.ones( + (sampled_fg_rois.shape[0], 1) + ) + sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois)) + + blobs['keypoint_rois'] = sampled_fg_rois + blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False) + blobs['keypoint_weights'] = weights + + +def finalize_keypoint_minibatch(blobs, valid): + """Finalize the minibatch after blobs for all minibatch images have been + collated. + """ + min_count = cfg.KRCNN.MIN_KEYPOINT_COUNT_FOR_VALID_MINIBATCH + num_visible_keypoints = np.sum(blobs['keypoint_weights']) + valid = ( + valid and len(blobs['keypoint_weights']) > 0 and + num_visible_keypoints > min_count + ) + # Normalizer to use if cfg.KRCNN.NORMALIZE_BY_VISIBLE_KEYPOINTS is False. + # See modeling.model_builder.add_keypoint_losses + norm = num_visible_keypoints / ( + cfg.TRAIN.IMS_PER_BATCH * cfg.TRAIN.BATCH_SIZE_PER_IM * + cfg.TRAIN.FG_FRACTION * cfg.KRCNN.NUM_KEYPOINTS + ) + blobs['keypoint_loss_normalizer'] = np.array(norm, dtype=np.float32) + return valid + + +def _within_box(points, boxes): + """Validate which keypoints are contained inside a given box. + + points: Nx2xK + boxes: Nx4 + output: NxK + """ + x_within = np.logical_and( + points[:, 0, :] >= np.expand_dims(boxes[:, 0], axis=1), + points[:, 0, :] <= np.expand_dims(boxes[:, 2], axis=1) + ) + y_within = np.logical_and( + points[:, 1, :] >= np.expand_dims(boxes[:, 1], axis=1), + points[:, 1, :] <= np.expand_dims(boxes[:, 3], axis=1) + ) + return np.logical_and(x_within, y_within) diff --git a/detectron/roi_data/loader.py b/detectron/roi_data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..71cb18f5e9f1f2179f16f5f04f358dfc929ebc8e --- /dev/null +++ b/detectron/roi_data/loader.py @@ -0,0 +1,295 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Detectron data loader. The design is generic and abstracted away from any +details of the minibatch. A minibatch is a dictionary of blob name keys and +their associated numpy (float32 or int32) ndarray values. + +Outline of the data loader design: + +loader thread\ +loader thread \ / GPU 1 enqueue thread -> feed -> EnqueueOp +... -> minibatch queue -> ... +loader thread / \ GPU N enqueue thread -> feed -> EnqueueOp +loader thread/ + +<---------------------------- CPU -----------------------------|---- GPU ----> + +A pool of loader threads construct minibatches that are put onto the shared +minibatch queue. Each GPU has an enqueue thread that pulls a minibatch off the +minibatch queue, feeds the minibatch blobs into the workspace, and then runs +an EnqueueBlobsOp to place the minibatch blobs into the GPU's blobs queue. +During each fprop the first thing the network does is run a DequeueBlobsOp +in order to populate the workspace with the blobs from a queued minibatch. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import deque +from collections import OrderedDict +import logging +import numpy as np +import signal +import threading +import time +import uuid +from six.moves import queue as Queue + +from caffe2.python import core, workspace + +from detectron.core.config import cfg +from detectron.roi_data.minibatch import get_minibatch +from detectron.roi_data.minibatch import get_minibatch_blob_names +from detectron.utils.coordinator import coordinated_get +from detectron.utils.coordinator import coordinated_put +from detectron.utils.coordinator import Coordinator +import detectron.utils.c2 as c2_utils + +logger = logging.getLogger(__name__) + + +class RoIDataLoader(object): + def __init__( + self, + roidb, + num_loaders=4, + minibatch_queue_size=64, + blobs_queue_capacity=8 + ): + self._roidb = roidb + self._lock = threading.Lock() + self._perm = deque(range(len(self._roidb))) + self._cur = 0 # _perm cursor + # The minibatch queue holds prepared training data in host (CPU) memory + # When training with N > 1 GPUs, each element in the minibatch queue + # is actually a partial minibatch which contributes 1 / N of the + # examples to the overall minibatch + self._minibatch_queue = Queue.Queue(maxsize=minibatch_queue_size) + self._blobs_queue_capacity = blobs_queue_capacity + # Random queue name in case one instantiates multple RoIDataLoaders + self._loader_id = uuid.uuid4() + self._blobs_queue_name = 'roi_blobs_queue_{}'.format(self._loader_id) + # Loader threads construct (partial) minibatches and put them on the + # minibatch queue + self._num_loaders = num_loaders + self._num_gpus = cfg.NUM_GPUS + self.coordinator = Coordinator() + + self._output_names = get_minibatch_blob_names() + self._shuffle_roidb_inds() + self.create_threads() + + def minibatch_loader_thread(self): + """Load mini-batches and 
put them onto the mini-batch queue.""" + with self.coordinator.stop_on_exception(): + while not self.coordinator.should_stop(): + blobs = self.get_next_minibatch() + # Blobs must be queued in the order specified by + # self.get_output_names + ordered_blobs = OrderedDict() + for key in self.get_output_names(): + assert blobs[key].dtype in (np.int32, np.float32), \ + 'Blob {} of dtype {} must have dtype of ' \ + 'np.int32 or np.float32'.format(key, blobs[key].dtype) + ordered_blobs[key] = blobs[key] + coordinated_put( + self.coordinator, self._minibatch_queue, ordered_blobs + ) + logger.info('Stopping mini-batch loading thread') + + def enqueue_blobs_thread(self, gpu_id, blob_names): + """Transfer mini-batches from a mini-batch queue to a BlobsQueue.""" + with self.coordinator.stop_on_exception(): + while not self.coordinator.should_stop(): + if self._minibatch_queue.qsize == 0: + logger.warning('Mini-batch queue is empty') + blobs = coordinated_get(self.coordinator, self._minibatch_queue) + self.enqueue_blobs(gpu_id, blob_names, blobs.values()) + logger.debug( + 'batch queue size {}'.format(self._minibatch_queue.qsize()) + ) + logger.info('Stopping enqueue thread') + + def get_next_minibatch(self): + """Return the blobs to be used for the next minibatch. Thread safe.""" + valid = False + while not valid: + db_inds = self._get_next_minibatch_inds() + minibatch_db = [self._roidb[i] for i in db_inds] + blobs, valid = get_minibatch(minibatch_db) + return blobs + + def _shuffle_roidb_inds(self): + """Randomly permute the training roidb. 
Not thread safe.""" + if cfg.TRAIN.ASPECT_GROUPING: + widths = np.array([r['width'] for r in self._roidb]) + heights = np.array([r['height'] for r in self._roidb]) + horz = (widths >= heights) + vert = np.logical_not(horz) + horz_inds = np.where(horz)[0] + vert_inds = np.where(vert)[0] + + horz_inds = np.random.permutation(horz_inds) + vert_inds = np.random.permutation(vert_inds) + mb = cfg.TRAIN.IMS_PER_BATCH + horz_inds = horz_inds[:(len(horz_inds) // mb) * mb] + vert_inds = vert_inds[:(len(vert_inds) // mb) * mb] + inds = np.hstack((horz_inds, vert_inds)) + + inds = np.reshape(inds, (-1, mb)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1, )) + self._perm = inds + else: + self._perm = np.random.permutation(np.arange(len(self._roidb))) + self._perm = deque(self._perm) + self._cur = 0 + + def _get_next_minibatch_inds(self): + """Return the roidb indices for the next minibatch. Thread safe.""" + with self._lock: + # We use a deque and always take the *first* IMS_PER_BATCH items + # followed by *rotating* the deque so that we see fresh items + # each time. If the length of _perm is not divisible by + # IMS_PER_BATCH, then we end up wrapping around the permutation. 
+ db_inds = [self._perm[i] for i in range(cfg.TRAIN.IMS_PER_BATCH)] + self._perm.rotate(-cfg.TRAIN.IMS_PER_BATCH) + self._cur += cfg.TRAIN.IMS_PER_BATCH + if self._cur >= len(self._perm): + self._shuffle_roidb_inds() + return db_inds + + def get_output_names(self): + return self._output_names + + def enqueue_blobs(self, gpu_id, blob_names, blobs): + """Put a mini-batch on a BlobsQueue.""" + assert len(blob_names) == len(blobs) + t = time.time() + dev = c2_utils.CudaDevice(gpu_id) + queue_name = 'gpu_{}/{}'.format(gpu_id, self._blobs_queue_name) + blob_names = ['gpu_{}/{}'.format(gpu_id, b) for b in blob_names] + for (blob_name, blob) in zip(blob_names, blobs): + workspace.FeedBlob(blob_name, blob, device_option=dev) + logger.debug( + 'enqueue_blobs {}: workspace.FeedBlob: {}'. + format(gpu_id, time.time() - t) + ) + t = time.time() + op = core.CreateOperator( + 'SafeEnqueueBlobs', [queue_name] + blob_names, + blob_names + [queue_name + '_enqueue_status'], + device_option=dev + ) + workspace.RunOperatorOnce(op) + logger.debug( + 'enqueue_blobs {}: workspace.RunOperatorOnce: {}'. 
+ format(gpu_id, time.time() - t) + ) + + def create_threads(self): + # Create mini-batch loader threads, each of which builds mini-batches + # and places them into a queue in CPU memory + self._workers = [ + threading.Thread(target=self.minibatch_loader_thread) + for _ in range(self._num_loaders) + ] + + # Create one BlobsQueue per GPU + # (enqueue_blob_names are unscoped) + enqueue_blob_names = self.create_blobs_queues() + + # Create one enqueuer thread per GPU + self._enqueuers = [ + threading.Thread( + target=self.enqueue_blobs_thread, + args=(gpu_id, enqueue_blob_names) + ) for gpu_id in range(self._num_gpus) + ] + + def start(self, prefill=False): + for w in self._workers + self._enqueuers: + w.setDaemon(True) + w.start() + if prefill: + logger.info('Pre-filling mini-batch queue...') + while not self._minibatch_queue.full(): + logger.info( + ' [{:d}/{:d}]'.format( + self._minibatch_queue.qsize(), + self._minibatch_queue.maxsize + ) + ) + time.sleep(0.1) + # Detect failure and shutdown + if self.coordinator.should_stop(): + self.shutdown() + break + + def has_stopped(self): + return self.coordinator.should_stop() + + def shutdown(self): + self.coordinator.request_stop() + self.coordinator.wait_for_stop() + self.close_blobs_queues() + for w in self._workers + self._enqueuers: + w.join() + + def create_blobs_queues(self): + """Create one BlobsQueue for each GPU to hold mini-batches.""" + for gpu_id in range(self._num_gpus): + with c2_utils.GpuNameScope(gpu_id): + workspace.RunOperatorOnce( + core.CreateOperator( + 'CreateBlobsQueue', [], [self._blobs_queue_name], + num_blobs=len(self.get_output_names()), + capacity=self._blobs_queue_capacity + ) + ) + return self.create_enqueue_blobs() + + def close_blobs_queues(self): + """Close a BlobsQueue.""" + for gpu_id in range(self._num_gpus): + with core.NameScope('gpu_{}'.format(gpu_id)): + workspace.RunOperatorOnce( + core.CreateOperator( + 'CloseBlobsQueue', [self._blobs_queue_name], [] + ) + ) + + def 
create_enqueue_blobs(self): + blob_names = self.get_output_names() + enqueue_blob_names = [ + '{}_enqueue_{}'.format(b, self._loader_id) for b in blob_names + ] + for gpu_id in range(self._num_gpus): + with c2_utils.NamedCudaScope(gpu_id): + for blob in enqueue_blob_names: + workspace.CreateBlob(core.ScopedName(blob)) + return enqueue_blob_names + + def register_sigint_handler(self): + def signal_handler(signal, frame): + logger.info( + 'SIGINT: Shutting down RoIDataLoader threads and exiting...' + ) + self.shutdown() + + signal.signal(signal.SIGINT, signal_handler) diff --git a/detectron/roi_data/mask_rcnn.py b/detectron/roi_data/mask_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..8be01bb39e5149624302b40735c80d3c3e039561 --- /dev/null +++ b/detectron/roi_data/mask_rcnn.py @@ -0,0 +1,126 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Construct minibatches for Mask R-CNN training. Handles the minibatch blobs +that are specific to Mask R-CNN. Other blobs that are generic to RPN or +Fast/er R-CNN are handled by their respecitive roi_data modules. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.blob as blob_utils +import detectron.utils.boxes as box_utils +import detectron.utils.segms as segm_utils + +logger = logging.getLogger(__name__) + + +def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx): + """Add Mask R-CNN specific blobs to the input blob dictionary.""" + # Prepare the mask targets by associating one gt mask to each training roi + # that has a fg (non-bg) class label. + M = cfg.MRCNN.RESOLUTION + polys_gt_inds = np.where( + (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0) + )[0] + polys_gt = [roidb['segms'][i] for i in polys_gt_inds] + boxes_from_polys = segm_utils.polys_to_boxes(polys_gt) + fg_inds = np.where(blobs['labels_int32'] > 0)[0] + roi_has_mask = blobs['labels_int32'].copy() + roi_has_mask[roi_has_mask > 0] = 1 + + if fg_inds.shape[0] > 0: + # Class labels for the foreground rois + mask_class_labels = blobs['labels_int32'][fg_inds] + masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True) + + # Find overlap between all foreground rois and the bounding boxes + # enclosing each segmentation + rois_fg = sampled_boxes[fg_inds] + overlaps_bbfg_bbpolys = box_utils.bbox_overlaps( + rois_fg.astype(np.float32, copy=False), + boxes_from_polys.astype(np.float32, copy=False) + ) + # Map from each fg rois to the index of the mask with highest overlap + # (measured by bbox overlap) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + + # add fg targets + for i in range(rois_fg.shape[0]): + fg_polys_ind = fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + # Rasterize the portion of the polygon mask within the given fg roi + # to an M x M binary image + mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M) + mask = 
np.array(mask > 0, dtype=np.int32) # Ensure it's binary + masks[i, :] = np.reshape(mask, M**2) + else: # If there are no fg masks (it does happen) + # The network cannot handle empty blobs, so we must provide a mask + # We simply take the first bg roi, given it an all -1's mask (ignore + # label), and label it with class zero (bg). + bg_inds = np.where(blobs['labels_int32'] == 0)[0] + # rois_fg is actually one background roi, but that's ok because ... + rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)) + # We give it an -1's blob (ignore label) + masks = -blob_utils.ones((1, M**2), int32=True) + # We label it with class = 0 (background) + mask_class_labels = blob_utils.zeros((1, )) + # Mark that the first roi has a mask + roi_has_mask[0] = 1 + + if cfg.MRCNN.CLS_SPECIFIC_MASK: + masks = _expand_to_class_specific_mask_targets(masks, mask_class_labels) + + # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2) + rois_fg *= im_scale + repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1)) + rois_fg = np.hstack((repeated_batch_idx, rois_fg)) + + # Update blobs dict with Mask R-CNN blobs + blobs['mask_rois'] = rois_fg + blobs['roi_has_mask_int32'] = roi_has_mask + blobs['masks_int32'] = masks + + +def _expand_to_class_specific_mask_targets(masks, mask_class_labels): + """Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) + to encode class specific mask targets. 
+ """ + assert masks.shape[0] == mask_class_labels.shape[0] + M = cfg.MRCNN.RESOLUTION + + # Target values of -1 are "don't care" / ignore labels + mask_targets = -blob_utils.ones( + (masks.shape[0], cfg.MODEL.NUM_CLASSES * M**2), int32=True + ) + + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = M**2 * cls + end = start + M**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + + return mask_targets diff --git a/detectron/roi_data/minibatch.py b/detectron/roi_data/minibatch.py new file mode 100644 index 0000000000000000000000000000000000000000..a680be92a1e9a9e939004ec25bafa293258a0e14 --- /dev/null +++ b/detectron/roi_data/minibatch.py @@ -0,0 +1,116 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Construct minibatches for Detectron networks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import logging +import numpy as np + +from detectron.core.config import cfg +import detectron.roi_data.fast_rcnn as fast_rcnn_roi_data +import detectron.roi_data.retinanet as retinanet_roi_data +import detectron.roi_data.rpn as rpn_roi_data +import detectron.utils.blob as blob_utils + +logger = logging.getLogger(__name__) + + +def get_minibatch_blob_names(is_training=True): + """Return blob names in the order in which they are read by the data loader. 
+ """ + # data blob: holds a batch of N images, each with 3 channels + blob_names = ['data'] + if cfg.RPN.RPN_ON: + # RPN-only or end-to-end Faster R-CNN + blob_names += rpn_roi_data.get_rpn_blob_names(is_training=is_training) + elif cfg.RETINANET.RETINANET_ON: + blob_names += retinanet_roi_data.get_retinanet_blob_names( + is_training=is_training + ) + else: + # Fast R-CNN like models trained on precomputed proposals + blob_names += fast_rcnn_roi_data.get_fast_rcnn_blob_names( + is_training=is_training + ) + return blob_names + + +def get_minibatch(roidb): + """Given a roidb, construct a minibatch sampled from it.""" + # We collect blobs from each image onto a list and then concat them into a + # single tensor, hence we initialize each blob to an empty list + blobs = {k: [] for k in get_minibatch_blob_names()} + # Get the input image blob, formatted for caffe2 + im_blob, im_scales = _get_image_blob(roidb) + blobs['data'] = im_blob + if cfg.RPN.RPN_ON: + # RPN-only or end-to-end Faster/Mask R-CNN + valid = rpn_roi_data.add_rpn_blobs(blobs, im_scales, roidb) + elif cfg.RETINANET.RETINANET_ON: + im_width, im_height = im_blob.shape[3], im_blob.shape[2] + # im_width, im_height corresponds to the network input: padded image + # (if needed) width and height. We pass it as input and slice the data + # accordingly so that we don't need to use SampleAsOp + valid = retinanet_roi_data.add_retinanet_blobs( + blobs, im_scales, roidb, im_width, im_height + ) + else: + # Fast R-CNN like models trained on precomputed proposals + valid = fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb) + return blobs, valid + + +def _get_image_blob(roidb): + """Builds an input blob from the images in the roidb at the specified + scales. 
+ """ + num_images = len(roidb) + # Sample random scales to use for each image in this batch + scale_inds = np.random.randint( + 0, high=len(cfg.TRAIN.SCALES), size=num_images + ) + processed_ims = [] + im_scales = [] + for i in range(num_images): + im = cv2.imread(roidb[i]['image']) + assert im is not None, \ + 'Failed to read image \'{}\''.format(roidb[i]['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + target_size = cfg.TRAIN.SCALES[scale_inds[i]] + im, im_scale = blob_utils.prep_im_for_blob( + im, cfg.PIXEL_MEANS, target_size, cfg.TRAIN.MAX_SIZE + ) + im_scales.append(im_scale) + processed_ims.append(im) + + # Create a blob to hold the input images + blob = blob_utils.im_list_to_blob(processed_ims) + + return blob, im_scales diff --git a/detectron/roi_data/retinanet.py b/detectron/roi_data/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..b737042cd022913233be6f52432ae382353dee4a --- /dev/null +++ b/detectron/roi_data/retinanet.py @@ -0,0 +1,288 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Compute minibatch blobs for training a RetinaNet network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import logging + +import detectron.utils.boxes as box_utils +import detectron.roi_data.data_utils as data_utils +from detectron.core.config import cfg + + +logger = logging.getLogger(__name__) + + +def get_retinanet_blob_names(is_training=True): + """ + Returns blob names in the order in which they are read by the data + loader. + + N = number of images per minibatch + A = number of anchors = num_scales * num_aspect_ratios + (for example 9 used in RetinaNet paper) + H, W = spatial dimensions (different for each FPN level) + M = Out of all the anchors generated, depending on the positive/negative IoU + overlap thresholds, we will have M positive anchors. These are the anchors + that bounding box branch will regress on. 
+ + retnet_cls_labels -> labels for the cls branch for each FPN level + Shape: N x A x H x W + + retnet_roi_bbox_targets -> targets for the bbox regression branch + Shape: M x 4 + + retnet_roi_fg_bbox_locs -> for the bbox regression, since we are only + interested in regressing on fg bboxes which are + M in number and the output prediction of the network + is of shape N x (A * 4) x H x W + (in case of non class-specific bbox), so we + store the locations of positive fg boxes in this + blob retnet_roi_fg_bbox_locs of shape M x 4 where + each row looks like: [img_id, anchor_id, x_loc, y_loc] + """ + # im_info: (height, width, image scale) + blob_names = ['im_info'] + assert cfg.FPN.FPN_ON, "RetinaNet uses FPN for dense detection" + # Same format as RPN blobs, but one per FPN level + if is_training: + blob_names += ['retnet_fg_num', 'retnet_bg_num'] + for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1): + suffix = 'fpn{}'.format(lvl) + blob_names += [ + 'retnet_cls_labels_' + suffix, + 'retnet_roi_bbox_targets_' + suffix, + 'retnet_roi_fg_bbox_locs_' + suffix, + ] + return blob_names + + +def add_retinanet_blobs(blobs, im_scales, roidb, image_width, image_height): + """Add RetinaNet blobs.""" + # RetinaNet is applied to many feature levels, as in the FPN paper + k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL + scales_per_octave = cfg.RETINANET.SCALES_PER_OCTAVE + num_aspect_ratios = len(cfg.RETINANET.ASPECT_RATIOS) + aspect_ratios = cfg.RETINANET.ASPECT_RATIOS + anchor_scale = cfg.RETINANET.ANCHOR_SCALE + + # get anchors from all levels for all scales/aspect ratios + foas = [] + for lvl in range(k_min, k_max + 1): + stride = 2. 
** lvl + for octave in range(scales_per_octave): + octave_scale = 2 ** (octave / float(scales_per_octave)) + for idx in range(num_aspect_ratios): + anchor_sizes = (stride * octave_scale * anchor_scale, ) + anchor_aspect_ratios = (aspect_ratios[idx], ) + foa = data_utils.get_field_of_anchors( + stride, anchor_sizes, anchor_aspect_ratios, octave, idx) + foas.append(foa) + all_anchors = np.concatenate([f.field_of_anchors for f in foas]) + + blobs['retnet_fg_num'], blobs['retnet_bg_num'] = 0.0, 0.0 + for im_i, entry in enumerate(roidb): + scale = im_scales[im_i] + im_height = np.round(entry['height'] * scale) + im_width = np.round(entry['width'] * scale) + gt_inds = np.where( + (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] + assert len(gt_inds) > 0, \ + 'Empty ground truth empty for image is not allowed. Please check.' + + gt_rois = entry['boxes'][gt_inds, :] * scale + gt_classes = entry['gt_classes'][gt_inds] + + im_info = np.array([[im_height, im_width, scale]], dtype=np.float32) + blobs['im_info'].append(im_info) + + retinanet_blobs, fg_num, bg_num = _get_retinanet_blobs( + foas, all_anchors, gt_rois, gt_classes, image_width, image_height) + for i, foa in enumerate(foas): + for k, v in retinanet_blobs[i].items(): + # the way it stacks is: + # [[anchors for image1] + [anchors for images 2]] + level = int(np.log2(foa.stride)) + key = '{}_fpn{}'.format(k, level) + if k == 'retnet_roi_fg_bbox_locs': + v[:, 0] = im_i + # loc_stride: 80 * 4 if cls_specific else 4 + loc_stride = 4 # 4 coordinate corresponding to bbox prediction + if cfg.RETINANET.CLASS_SPECIFIC_BBOX: + loc_stride *= (cfg.MODEL.NUM_CLASSES - 1) + anchor_ind = foa.octave * num_aspect_ratios + foa.aspect + # v[:, 1] is the class label [range 0-80] if we do + # class-specfic bbox otherwise it is 0. 
In case of class + # specific, based on the label, the location of current + # anchor is class_label * 4 and then we take into account + # the anchor_ind if the anchors + v[:, 1] *= 4 + v[:, 1] += loc_stride * anchor_ind + blobs[key].append(v) + blobs['retnet_fg_num'] += fg_num + blobs['retnet_bg_num'] += bg_num + + blobs['retnet_fg_num'] = blobs['retnet_fg_num'].astype(np.float32) + blobs['retnet_bg_num'] = blobs['retnet_bg_num'].astype(np.float32) + + N = len(roidb) + for k, v in blobs.items(): + if isinstance(v, list) and len(v) > 0: + # compute number of anchors + A = int(len(v) / N) + # for the cls branch labels [per fpn level], + # we have blobs['retnet_cls_labels_fpn{}'] as a list until this step + # and length of this list is N x A where + # N = num_images, A = num_anchors for example, N = 2, A = 9 + # Each element of the list has the shape 1 x 1 x H x W where H, W are + # spatial dimension of curret fpn lvl. Let a{i} denote the element + # corresponding to anchor i [9 anchors total] in the list. + # The elements in the list are in order [[a0, ..., a9], [a0, ..., a9]] + # however the network will make predictions like 2 x (9 * 80) x H x W + # so we first concatenate the elements of each image to a numpy array + # and then concatenate the two images to get the 2 x 9 x H x W + + if k.find('retnet_cls_labels') >= 0: + tmp = [] + # concat anchors within an image + for i in range(0, len(v), A): + tmp.append(np.concatenate(v[i: i + A], axis=1)) + # concat images + blobs[k] = np.concatenate(tmp, axis=0) + else: + # for the bbox branch elements [per FPN level], + # we have the targets and the fg boxes locations + # in the shape: M x 4 where M is the number of fg locations in a + # given image at the current FPN level. For the given level, + # the bbox predictions will be. 
The elements in the list are in + # order [[a0, ..., a9], [a0, ..., a9]] + # Concatenate them to form M x 4 + blobs[k] = np.concatenate(v, axis=0) + return True + + +def _get_retinanet_blobs( + foas, all_anchors, gt_boxes, gt_classes, im_width, im_height): + total_anchors = all_anchors.shape[0] + logger.debug('Getting mad blobs: im_height {} im_width: {}'.format( + im_height, im_width)) + + inds_inside = np.arange(all_anchors.shape[0]) + anchors = all_anchors + num_inside = len(inds_inside) + + logger.debug('total_anchors: {}'.format(total_anchors)) + logger.debug('inds_inside: {}'.format(num_inside)) + logger.debug('anchors.shape: {}'.format(anchors.shape)) + + # Compute anchor labels: + # label=1 is positive, 0 is negative, -1 is don't care (ignore) + labels = np.empty((num_inside, ), dtype=np.float32) + labels.fill(-1) + if len(gt_boxes) > 0: + # Compute overlaps between the anchors and the gt boxes overlaps + anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes) + # Map from anchor to gt box that has highest overlap + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + # For each anchor, amount of overlap with most overlapping gt box + anchor_to_gt_max = anchor_by_gt_overlap[ + np.arange(num_inside), anchor_to_gt_argmax] + + # Map from gt box to an anchor that has highest overlap + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + # For each gt box, amount of overlap with most overlapping anchor + gt_to_anchor_max = anchor_by_gt_overlap[ + gt_to_anchor_argmax, np.arange(anchor_by_gt_overlap.shape[1])] + # Find all anchors that share the max overlap amount + # (this includes many ties) + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + # Fg label: for each gt use anchors with highest overlap + # (including ties) + gt_inds = anchor_to_gt_argmax[anchors_with_max_overlap] + labels[anchors_with_max_overlap] = gt_classes[gt_inds] + # Fg label: above threshold IOU + inds = anchor_to_gt_max >= 
cfg.RETINANET.POSITIVE_OVERLAP + gt_inds = anchor_to_gt_argmax[inds] + labels[inds] = gt_classes[gt_inds] + + fg_inds = np.where(labels >= 1)[0] + bg_inds = np.where(anchor_to_gt_max < cfg.RETINANET.NEGATIVE_OVERLAP)[0] + labels[bg_inds] = 0 + num_fg, num_bg = len(fg_inds), len(bg_inds) + + bbox_targets = np.zeros((num_inside, 4), dtype=np.float32) + bbox_targets[fg_inds, :] = data_utils.compute_targets( + anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :]) + + # Map up to original set of anchors + labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = data_utils.unmap(bbox_targets, total_anchors, inds_inside, fill=0) + + # Split the generated labels, etc. into labels per each field of anchors + blobs_out = [] + start_idx = 0 + for foa in foas: + H = foa.field_size + W = foa.field_size + end_idx = start_idx + H * W + _labels = labels[start_idx:end_idx] + _bbox_targets = bbox_targets[start_idx:end_idx, :] + start_idx = end_idx + + # labels output with shape (1, height, width) + _labels = _labels.reshape((1, 1, H, W)) + # bbox_targets output with shape (1, 4 * A, height, width) + _bbox_targets = _bbox_targets.reshape((1, H, W, 4)).transpose(0, 3, 1, 2) + stride = foa.stride + w = int(im_width / stride) + h = int(im_height / stride) + + # data for select_smooth_l1 loss + num_classes = cfg.MODEL.NUM_CLASSES - 1 + inds_4d = np.where(_labels > 0) + M = len(inds_4d) + _roi_bbox_targets = np.zeros((0, 4)) + _roi_fg_bbox_locs = np.zeros((0, 4)) + if M > 0: + im_inds, y, x = inds_4d[0], inds_4d[2], inds_4d[3] + _roi_bbox_targets = np.zeros((len(im_inds), 4)) + _roi_fg_bbox_locs = np.zeros((len(im_inds), 4)) + lbls = _labels[im_inds, :, y, x] + for i, lbl in enumerate(lbls): + l = lbl[0] - 1 + if not cfg.RETINANET.CLASS_SPECIFIC_BBOX: + l = 0 + assert l >= 0 and l < num_classes, 'label out of the range' + _roi_bbox_targets[i, :] = _bbox_targets[:, :, y[i], x[i]] + _roi_fg_bbox_locs[i, :] = np.array([[0, l, y[i], x[i]]]) + 
blobs_out.append( + dict( + retnet_cls_labels=_labels[:, :, 0:h, 0:w].astype(np.int32), + retnet_roi_bbox_targets=_roi_bbox_targets.astype(np.float32), + retnet_roi_fg_bbox_locs=_roi_fg_bbox_locs.astype(np.float32), + )) + out_num_fg = np.array([num_fg + 1.0], dtype=np.float32) + out_num_bg = ( + np.array([num_bg + 1.0]) * (cfg.MODEL.NUM_CLASSES - 1) + + out_num_fg * (cfg.MODEL.NUM_CLASSES - 2)) + return blobs_out, out_num_fg, out_num_bg diff --git a/detectron/roi_data/rpn.py b/detectron/roi_data/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..6adb5a75b53a2c7a71dbd68fded7b52594512276 --- /dev/null +++ b/detectron/roi_data/rpn.py @@ -0,0 +1,280 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Minibatch construction for Region Proposal Networks (RPN).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np +import numpy.random as npr + +from detectron.core.config import cfg +import detectron.roi_data.data_utils as data_utils +import detectron.utils.blob as blob_utils +import detectron.utils.boxes as box_utils + +logger = logging.getLogger(__name__) + + +def get_rpn_blob_names(is_training=True): + """Blob names used by RPN.""" + # im_info: (height, width, image scale) + blob_names = ['im_info'] + if is_training: + # gt boxes: (batch_idx, x1, y1, x2, y2, cls) + blob_names += ['roidb'] + if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: + # Same format as RPN blobs, but one per FPN level + for lvl in range(cfg.FPN.RPN_MIN_LEVEL, cfg.FPN.RPN_MAX_LEVEL + 1): + blob_names += [ + 'rpn_labels_int32_wide_fpn' + str(lvl), + 'rpn_bbox_targets_wide_fpn' + str(lvl), + 'rpn_bbox_inside_weights_wide_fpn' + str(lvl), + 'rpn_bbox_outside_weights_wide_fpn' + str(lvl) + ] + else: + # Single level RPN blobs + blob_names += [ + 'rpn_labels_int32_wide', + 'rpn_bbox_targets_wide', + 'rpn_bbox_inside_weights_wide', + 'rpn_bbox_outside_weights_wide' + ] + return blob_names + + +def add_rpn_blobs(blobs, im_scales, roidb): + """Add blobs needed training RPN-only and end-to-end Faster R-CNN models.""" + if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: + # RPN applied to many feature levels, as in the FPN paper + k_max = cfg.FPN.RPN_MAX_LEVEL + k_min = cfg.FPN.RPN_MIN_LEVEL + foas = [] + for lvl in range(k_min, k_max + 1): + field_stride = 2.**lvl + anchor_sizes = (cfg.FPN.RPN_ANCHOR_START_SIZE * 2.**(lvl - k_min), ) + anchor_aspect_ratios = cfg.FPN.RPN_ASPECT_RATIOS + foa = data_utils.get_field_of_anchors( + field_stride, anchor_sizes, anchor_aspect_ratios + ) + 
foas.append(foa) + all_anchors = np.concatenate([f.field_of_anchors for f in foas]) + else: + foa = data_utils.get_field_of_anchors( + cfg.RPN.STRIDE, cfg.RPN.SIZES, cfg.RPN.ASPECT_RATIOS + ) + all_anchors = foa.field_of_anchors + + for im_i, entry in enumerate(roidb): + scale = im_scales[im_i] + im_height = np.round(entry['height'] * scale) + im_width = np.round(entry['width'] * scale) + gt_inds = np.where( + (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0) + )[0] + gt_rois = entry['boxes'][gt_inds, :] * scale + im_info = np.array([[im_height, im_width, scale]], dtype=np.float32) + blobs['im_info'].append(im_info) + + # Add RPN targets + if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN: + # RPN applied to many feature levels, as in the FPN paper + rpn_blobs = _get_rpn_blobs( + im_height, im_width, foas, all_anchors, gt_rois + ) + for i, lvl in enumerate(range(k_min, k_max + 1)): + for k, v in rpn_blobs[i].items(): + blobs[k + '_fpn' + str(lvl)].append(v) + else: + # Classical RPN, applied to a single feature level + rpn_blobs = _get_rpn_blobs( + im_height, im_width, [foa], all_anchors, gt_rois + ) + for k, v in rpn_blobs.items(): + blobs[k].append(v) + + for k, v in blobs.items(): + if isinstance(v, list) and len(v) > 0: + blobs[k] = np.concatenate(v) + + valid_keys = [ + 'has_visible_keypoints', 'boxes', 'segms', 'seg_areas', 'gt_classes', + 'gt_overlaps', 'is_crowd', 'box_to_gt_ind_map', 'gt_keypoints' + ] + minimal_roidb = [{} for _ in range(len(roidb))] + for i, e in enumerate(roidb): + for k in valid_keys: + if k in e: + minimal_roidb[i][k] = e[k] + blobs['roidb'] = blob_utils.serialize(minimal_roidb) + + # Always return valid=True, since RPN minibatches are valid by design + return True + + +def _get_rpn_blobs(im_height, im_width, foas, all_anchors, gt_boxes): + total_anchors = all_anchors.shape[0] + straddle_thresh = cfg.TRAIN.RPN_STRADDLE_THRESH + + if straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + # Set 
TRAIN.RPN_STRADDLE_THRESH to -1 (or a large value) to keep all + # anchors + inds_inside = np.where( + (all_anchors[:, 0] >= -straddle_thresh) & + (all_anchors[:, 1] >= -straddle_thresh) & + (all_anchors[:, 2] < im_width + straddle_thresh) & + (all_anchors[:, 3] < im_height + straddle_thresh) + )[0] + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + else: + inds_inside = np.arange(all_anchors.shape[0]) + anchors = all_anchors + num_inside = len(inds_inside) + + logger.debug('total_anchors: {}'.format(total_anchors)) + logger.debug('inds_inside: {}'.format(num_inside)) + logger.debug('anchors.shape: {}'.format(anchors.shape)) + + # Compute anchor labels: + # label=1 is positive, 0 is negative, -1 is don't care (ignore) + labels = np.empty((num_inside, ), dtype=np.int32) + labels.fill(-1) + if len(gt_boxes) > 0: + # Compute overlaps between the anchors and the gt boxes overlaps + anchor_by_gt_overlap = box_utils.bbox_overlaps(anchors, gt_boxes) + # Map from anchor to gt box that has highest overlap + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + # For each anchor, amount of overlap with most overlapping gt box + anchor_to_gt_max = anchor_by_gt_overlap[np.arange(num_inside), + anchor_to_gt_argmax] + + # Map from gt box to an anchor that has highest overlap + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + # For each gt box, amount of overlap with most overlapping anchor + gt_to_anchor_max = anchor_by_gt_overlap[ + gt_to_anchor_argmax, + np.arange(anchor_by_gt_overlap.shape[1]) + ] + # Find all anchors that share the max overlap amount + # (this includes many ties) + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max + )[0] + + # Fg label: for each gt use anchors with highest overlap + # (including ties) + labels[anchors_with_max_overlap] = 1 + # Fg label: above threshold IOU + labels[anchor_to_gt_max >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + # subsample positive labels if we have too many + 
num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCH_SIZE_PER_IM) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False + ) + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] + + # subsample negative labels if we have too many + # (samples with replacement, but since the set of bg inds is large most + # samples will not have repeats) + num_bg = cfg.TRAIN.RPN_BATCH_SIZE_PER_IM - np.sum(labels == 1) + bg_inds = np.where(anchor_to_gt_max < cfg.TRAIN.RPN_NEGATIVE_OVERLAP)[0] + if len(bg_inds) > num_bg: + enable_inds = bg_inds[npr.randint(len(bg_inds), size=num_bg)] + else: + enable_inds = bg_inds + + labels[enable_inds] = 0 + bg_inds = np.where(labels == 0)[0] + + bbox_targets = np.zeros((num_inside, 4), dtype=np.float32) + bbox_targets[fg_inds, :] = data_utils.compute_targets( + anchors[fg_inds, :], gt_boxes[anchor_to_gt_argmax[fg_inds], :] + ) + + # Bbox regression loss has the form: + # loss(x) = weight_outside * L(weight_inside * x) + # Inside weights allow us to set zero loss on an element-wise basis + # Bbox regression is only trained on positive examples so we set their + # weights to 1.0 (or otherwise if config is different) and 0 otherwise + bbox_inside_weights = np.zeros((num_inside, 4), dtype=np.float32) + bbox_inside_weights[labels == 1, :] = (1.0, 1.0, 1.0, 1.0) + + # The bbox regression loss only averages by the number of images in the + # mini-batch, whereas we need to average by the total number of example + # anchors selected + # Outside weights are used to scale each element-wise loss so the final + # average over the mini-batch is correct + bbox_outside_weights = np.zeros((num_inside, 4), dtype=np.float32) + # uniform weighting of examples (given non-uniform sampling) + num_examples = np.sum(labels >= 0) + bbox_outside_weights[labels == 1, :] = 1.0 / num_examples + bbox_outside_weights[labels == 0, :] = 1.0 / num_examples + + # Map up 
to original set of anchors + labels = data_utils.unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = data_utils.unmap( + bbox_targets, total_anchors, inds_inside, fill=0 + ) + bbox_inside_weights = data_utils.unmap( + bbox_inside_weights, total_anchors, inds_inside, fill=0 + ) + bbox_outside_weights = data_utils.unmap( + bbox_outside_weights, total_anchors, inds_inside, fill=0 + ) + + # Split the generated labels, etc. into labels per each field of anchors + blobs_out = [] + start_idx = 0 + for foa in foas: + H = foa.field_size + W = foa.field_size + A = foa.num_cell_anchors + end_idx = start_idx + H * W * A + _labels = labels[start_idx:end_idx] + _bbox_targets = bbox_targets[start_idx:end_idx, :] + _bbox_inside_weights = bbox_inside_weights[start_idx:end_idx, :] + _bbox_outside_weights = bbox_outside_weights[start_idx:end_idx, :] + start_idx = end_idx + + # labels output with shape (1, A, height, width) + _labels = _labels.reshape((1, H, W, A)).transpose(0, 3, 1, 2) + # bbox_targets output with shape (1, 4 * A, height, width) + _bbox_targets = _bbox_targets.reshape( + (1, H, W, A * 4)).transpose(0, 3, 1, 2) + # bbox_inside_weights output with shape (1, 4 * A, height, width) + _bbox_inside_weights = _bbox_inside_weights.reshape( + (1, H, W, A * 4)).transpose(0, 3, 1, 2) + # bbox_outside_weights output with shape (1, 4 * A, height, width) + _bbox_outside_weights = _bbox_outside_weights.reshape( + (1, H, W, A * 4)).transpose(0, 3, 1, 2) + blobs_out.append( + dict( + rpn_labels_int32_wide=_labels, + rpn_bbox_targets_wide=_bbox_targets, + rpn_bbox_inside_weights_wide=_bbox_inside_weights, + rpn_bbox_outside_weights_wide=_bbox_outside_weights + ) + ) + return blobs_out[0] if len(blobs_out) == 1 else blobs_out diff --git a/detectron/tests/data_loader_benchmark.py b/detectron/tests/data_loader_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..b147a88851e5ffd8b15b6a49a945c1b31d119992 --- /dev/null +++ 
b/detectron/tests/data_loader_benchmark.py @@ -0,0 +1,167 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# Example usage: +# data_loader_benchmark.par \ +# NUM_GPUS 2 \ +# TRAIN.DATASETS "('voc_2007_trainval',)" \ +# TRAIN.PROPOSAL_FILES /path/to/voc_2007_trainval/proposals.pkl \ +# DATA_LOADER.NUM_THREADS 4 \ +# DATA_LOADER.MINIBATCH_QUEUE_SIZE 64 \ +# DATA_LOADER.BLOBS_QUEUE_CAPACITY 8 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import logging +import numpy as np +import pprint +import sys +import time + +from caffe2.python import core +from caffe2.python import muji +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import merge_cfg_from_file +from detectron.core.config import merge_cfg_from_list +from detectron.datasets.roidb import combined_roidb_for_training +from detectron.roi_data.loader import RoIDataLoader +from detectron.utils.logging import setup_logging +from detectron.utils.timer import Timer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--num-batches', dest='num_batches', + help='Number of minibatches to run', + default=200, type=int) + 
parser.add_argument( + '--sleep', dest='sleep_time', + help='Seconds sleep to emulate a network running', + default=0.1, type=float) + parser.add_argument( + '--cfg', dest='cfg_file', help='optional config file', default=None, + type=str) + parser.add_argument( + '--x-factor', dest='x_factor', help='simulates x-factor more GPUs', + default=1, type=int) + parser.add_argument( + '--profiler', dest='profiler', help='profile minibatch load time', + action='store_true') + parser.add_argument( + 'opts', help='See detectron/core/config.py for all options', default=None, + nargs=argparse.REMAINDER) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + return args + + +def loader_loop(roi_data_loader): + load_timer = Timer() + iters = 100 + for i in range(iters): + load_timer.tic() + roi_data_loader.get_next_minibatch() + load_timer.toc() + print('{:d}/{:d}: Average get_next_minibatch time: {:.3f}s'.format( + i + 1, iters, load_timer.average_time)) + + +def main(opts): + logger = logging.getLogger(__name__) + roidb = combined_roidb_for_training( + cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) + logger.info('{:d} roidb entries'.format(len(roidb))) + roi_data_loader = RoIDataLoader( + roidb, + num_loaders=cfg.DATA_LOADER.NUM_THREADS, + minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE, + blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY + ) + blob_names = roi_data_loader.get_output_names() + + net = core.Net('dequeue_net') + net.type = 'dag' + all_blobs = [] + for gpu_id in range(cfg.NUM_GPUS): + with core.NameScope('gpu_{}'.format(gpu_id)): + with core.DeviceScope(muji.OnGPU(gpu_id)): + for blob_name in blob_names: + blob = core.ScopedName(blob_name) + all_blobs.append(blob) + workspace.CreateBlob(blob) + logger.info('Creating blob: {}'.format(blob)) + net.DequeueBlobs( + roi_data_loader._blobs_queue_name, blob_names) + logger.info("Protobuf:\n" + str(net.Proto())) + + if opts.profiler: + import cProfile + 
cProfile.runctx( + 'loader_loop(roi_data_loader)', globals(), locals(), + sort='cumulative') + else: + loader_loop(roi_data_loader) + + roi_data_loader.register_sigint_handler() + roi_data_loader.start(prefill=True) + total_time = 0 + for i in range(opts.num_batches): + start_t = time.time() + for _ in range(opts.x_factor): + workspace.RunNetOnce(net) + total_time += (time.time() - start_t) / opts.x_factor + logger.info( + '{:d}/{:d}: Averge dequeue time: {:.3f}s [{:d}/{:d}]'.format( + i + 1, opts.num_batches, total_time / (i + 1), + roi_data_loader._minibatch_queue.qsize(), + cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE + ) + ) + # Sleep to simulate the time taken by running a little network + time.sleep(opts.sleep_time) + # To inspect: + # blobs = workspace.FetchBlobs(all_blobs) + # from IPython import embed; embed() + logger.info('Shutting down data loader...') + roi_data_loader.shutdown() + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + logger = setup_logging(__name__) + logger.setLevel(logging.DEBUG) + logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO) + np.random.seed(cfg.RNG_SEED) + args = parse_args() + logger.info('Called with args:') + logger.info(args) + if args.cfg_file is not None: + merge_cfg_from_file(args.cfg_file) + if args.opts is not None: + merge_cfg_from_list(args.opts) + assert_and_infer_cfg() + logger.info('Running with config:') + logger.info(pprint.pformat(cfg)) + main(args) diff --git a/detectron/tests/test_batch_permutation_op.py b/detectron/tests/test_batch_permutation_op.py new file mode 100644 index 0000000000000000000000000000000000000000..28aa84f7490ea1b4fe5c6de7ec96af217a03a342 --- /dev/null +++ b/detectron/tests/test_batch_permutation_op.py @@ -0,0 +1,111 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import unittest + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import gradient_checker +from caffe2.python import workspace + +import detectron.utils.logging as logging_utils +import detectron.utils.c2 as c2_utils + + +class BatchPermutationOpTest(unittest.TestCase): + def _run_op_test(self, X, I, check_grad=False): + with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + op = core.CreateOperator('BatchPermutation', ['X', 'I'], ['Y']) + workspace.FeedBlob('X', X) + workspace.FeedBlob('I', I) + workspace.RunOperatorOnce(op) + Y = workspace.FetchBlob('Y') + + if check_grad: + gc = gradient_checker.GradientChecker( + stepsize=0.1, + threshold=0.001, + device_option=core.DeviceOption(caffe2_pb2.CUDA, 0) + ) + + res, grad, grad_estimated = gc.CheckSimple(op, [X, I], 0, [0]) + self.assertTrue(res, 'Grad check failed') + + Y_ref = X[I] + np.testing.assert_allclose(Y, Y_ref, rtol=1e-5, atol=1e-08) + + def _run_speed_test(self, iters=5, N=1024): + """This function provides an example of how to benchmark custom + operators using the Caffe2 'prof_dag' network execution type. Please + note that for 'prof_dag' to work, Caffe2 must be compiled with profiling + support using the `-DUSE_PROF=ON` option passed to `cmake` when building + Caffe2. 
+ """ + net = core.Net('test') + net.Proto().type = 'prof_dag' + net.Proto().num_workers = 2 + Y = net.BatchPermutation(['X', 'I'], 'Y') + Y_flat = net.FlattenToVec([Y], 'Y_flat') + loss = net.AveragedLoss([Y_flat], 'loss') + net.AddGradientOperators([loss]) + workspace.CreateNet(net) + + X = np.random.randn(N, 256, 14, 14) + for _i in range(iters): + I = np.random.permutation(N) + workspace.FeedBlob('X', X.astype(np.float32)) + workspace.FeedBlob('I', I.astype(np.int32)) + workspace.RunNet(net.Proto().name) + np.testing.assert_allclose( + workspace.FetchBlob('Y'), X[I], rtol=1e-5, atol=1e-08 + ) + + def test_forward_and_gradient(self): + A = np.random.randn(2, 3, 5, 7).astype(np.float32) + I = np.array([0, 1], dtype=np.int32) + self._run_op_test(A, I, check_grad=True) + + A = np.random.randn(2, 3, 5, 7).astype(np.float32) + I = np.array([1, 0], dtype=np.int32) + self._run_op_test(A, I, check_grad=True) + + A = np.random.randn(10, 3, 5, 7).astype(np.float32) + I = np.array(np.random.permutation(10), dtype=np.int32) + self._run_op_test(A, I, check_grad=True) + + def test_size_exceptions(self): + A = np.random.randn(2, 256, 42, 86).astype(np.float32) + I = np.array(np.random.permutation(10), dtype=np.int32) + with self.assertRaises(RuntimeError): + self._run_op_test(A, I) + + # See doc string in _run_speed_test + # def test_perf(self): + # with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + # self._run_speed_test() + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + c2_utils.import_detectron_ops() + assert 'BatchPermutation' in workspace.RegisteredOperators() + logging_utils.setup_logging(__name__) + unittest.main() diff --git a/detectron/tests/test_bbox_transform.py b/detectron/tests/test_bbox_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..7d204f625ccc49c19e6e46de7e4ab4e227769df5 --- /dev/null +++ b/detectron/tests/test_bbox_transform.py @@ -0,0 +1,107 @@ +# Copyright (c) 
2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import unittest + +from pycocotools import mask as COCOmask + +import detectron.utils.boxes as box_utils + + +def random_boxes(mean_box, stdev, N): + boxes = np.random.randn(N, 4) * stdev + mean_box + return boxes.astype(dtype=np.float32) + + +class TestBboxTransform(unittest.TestCase): + def test_bbox_transform_and_inverse(self): + weights = (5, 5, 10, 10) + src_boxes = random_boxes([10, 10, 20, 20], 1, 10) + dst_boxes = random_boxes([10, 10, 20, 20], 1, 10) + deltas = box_utils.bbox_transform_inv( + src_boxes, dst_boxes, weights=weights + ) + dst_boxes_reconstructed = box_utils.bbox_transform( + src_boxes, deltas, weights=weights + ) + np.testing.assert_array_almost_equal( + dst_boxes, dst_boxes_reconstructed, decimal=5 + ) + + def test_bbox_dataset_to_prediction_roundtrip(self): + """Simulate the process of reading a ground-truth box from a dataset, + make predictions from proposals, convert the predictions back to the + dataset format, and then use the COCO API to compute IoU overlap between + the gt box and the predictions. These should have IoU of 1. 
+ """ + weights = (5, 5, 10, 10) + # 1/ "read" a box from a dataset in the default (x1, y1, w, h) format + gt_xywh_box = [10, 20, 100, 150] + # 2/ convert it to our internal (x1, y1, x2, y2) format + gt_xyxy_box = box_utils.xywh_to_xyxy(gt_xywh_box) + # 3/ consider nearby proposal boxes + prop_xyxy_boxes = random_boxes(gt_xyxy_box, 10, 10) + # 4/ compute proposal-to-gt transformation deltas + deltas = box_utils.bbox_transform_inv( + prop_xyxy_boxes, np.array([gt_xyxy_box]), weights=weights + ) + # 5/ use deltas to transform proposals to xyxy predicted box + pred_xyxy_boxes = box_utils.bbox_transform( + prop_xyxy_boxes, deltas, weights=weights + ) + # 6/ convert xyxy predicted box to xywh predicted box + pred_xywh_boxes = box_utils.xyxy_to_xywh(pred_xyxy_boxes) + # 7/ use COCO API to compute IoU + not_crowd = [int(False)] * pred_xywh_boxes.shape[0] + ious = COCOmask.iou(pred_xywh_boxes, np.array([gt_xywh_box]), not_crowd) + np.testing.assert_array_almost_equal(ious, np.ones(ious.shape)) + + def test_cython_bbox_iou_against_coco_api_bbox_iou(self): + """Check that our cython implementation of bounding box IoU overlap + matches the COCO API implementation. 
+ """ + def _do_test(b1, b2): + # Compute IoU overlap with the cython implementation + cython_iou = box_utils.bbox_overlaps(b1, b2) + # Compute IoU overlap with the COCO API implementation + # (requires converting boxes from xyxy to xywh format) + xywh_b1 = box_utils.xyxy_to_xywh(b1) + xywh_b2 = box_utils.xyxy_to_xywh(b2) + not_crowd = [int(False)] * b2.shape[0] + coco_ious = COCOmask.iou(xywh_b1, xywh_b2, not_crowd) + # IoUs should be similar + np.testing.assert_array_almost_equal( + cython_iou, coco_ious, decimal=5 + ) + + # Test small boxes + b1 = random_boxes([10, 10, 20, 20], 5, 10) + b2 = random_boxes([10, 10, 20, 20], 5, 10) + _do_test(b1, b2) + + # Test bigger boxes + b1 = random_boxes([10, 10, 110, 20], 20, 10) + b2 = random_boxes([10, 10, 110, 20], 20, 10) + _do_test(b1, b2) + + +if __name__ == '__main__': + unittest.main() diff --git a/detectron/tests/test_cfg.py b/detectron/tests/test_cfg.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3bee20fdaae99b1957e7e7d54ecc3876bf04c2 --- /dev/null +++ b/detectron/tests/test_cfg.py @@ -0,0 +1,199 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import copy +import tempfile +import unittest + +from detectron.core.config import cfg +from detectron.utils.collections import AttrDict +import detectron.core.config as core_config +import detectron.utils.env as envu +import detectron.utils.logging as logging_utils + + +class TestAttrDict(unittest.TestCase): + def test_immutability(self): + # Top level immutable + a = AttrDict() + a.foo = 0 + a.immutable(True) + with self.assertRaises(AttributeError): + a.foo = 1 + a.bar = 1 + assert a.is_immutable() + assert a.foo == 0 + a.immutable(False) + assert not a.is_immutable() + a.foo = 1 + assert a.foo == 1 + + # Recursively immutable + a.level1 = AttrDict() + a.level1.foo = 0 + a.level1.level2 = AttrDict() + a.level1.level2.foo = 0 + a.immutable(True) + assert a.is_immutable() + with self.assertRaises(AttributeError): + a.level1.level2.foo = 1 + a.level1.bar = 1 + assert a.level1.level2.foo == 0 + + # Serialize immutability state + a.immutable(True) + a2 = core_config.load_cfg(envu.yaml_dump(a)) + assert a.is_immutable() + assert a2.is_immutable() + + +class TestCfg(unittest.TestCase): + def test_copy_cfg(self): + cfg2 = copy.deepcopy(cfg) + s = cfg.MODEL.TYPE + cfg2.MODEL.TYPE = 'dummy' + assert cfg.MODEL.TYPE == s + + def test_merge_cfg_from_cfg(self): + # Test: merge from deepcopy + s = 'dummy0' + cfg2 = copy.deepcopy(cfg) + cfg2.MODEL.TYPE = s + core_config.merge_cfg_from_cfg(cfg2) + assert cfg.MODEL.TYPE == s + + # Test: merge from yaml + s = 'dummy1' + cfg2 = core_config.load_cfg(envu.yaml_dump(cfg)) + cfg2.MODEL.TYPE = s + core_config.merge_cfg_from_cfg(cfg2) + assert cfg.MODEL.TYPE == s + + # Test: merge with a valid key + s = 'dummy2' + cfg2 = AttrDict() + cfg2.MODEL = AttrDict() + cfg2.MODEL.TYPE = s + 
core_config.merge_cfg_from_cfg(cfg2) + assert cfg.MODEL.TYPE == s + + # Test: merge with an invalid key + s = 'dummy3' + cfg2 = AttrDict() + cfg2.FOO = AttrDict() + cfg2.FOO.BAR = s + with self.assertRaises(KeyError): + core_config.merge_cfg_from_cfg(cfg2) + + # Test: merge with converted type + cfg2 = AttrDict() + cfg2.TRAIN = AttrDict() + cfg2.TRAIN.SCALES = [1] + core_config.merge_cfg_from_cfg(cfg2) + assert type(cfg.TRAIN.SCALES) is tuple + assert cfg.TRAIN.SCALES[0] == 1 + + # Test: merge with invalid type + cfg2 = AttrDict() + cfg2.TRAIN = AttrDict() + cfg2.TRAIN.SCALES = 1 + with self.assertRaises(ValueError): + core_config.merge_cfg_from_cfg(cfg2) + + def test_merge_cfg_from_file(self): + with tempfile.NamedTemporaryFile() as f: + envu.yaml_dump(cfg, f) + s = cfg.MODEL.TYPE + cfg.MODEL.TYPE = 'dummy' + assert cfg.MODEL.TYPE != s + core_config.merge_cfg_from_file(f.name) + assert cfg.MODEL.TYPE == s + + def test_merge_cfg_from_list(self): + opts = [ + 'TRAIN.SCALES', '(100, )', 'MODEL.TYPE', u'foobar', 'NUM_GPUS', 2 + ] + assert len(cfg.TRAIN.SCALES) > 0 + assert cfg.TRAIN.SCALES[0] != 100 + assert cfg.MODEL.TYPE != 'foobar' + assert cfg.NUM_GPUS != 2 + core_config.merge_cfg_from_list(opts) + assert type(cfg.TRAIN.SCALES) is tuple + assert len(cfg.TRAIN.SCALES) == 1 + assert cfg.TRAIN.SCALES[0] == 100 + assert cfg.MODEL.TYPE == 'foobar' + assert cfg.NUM_GPUS == 2 + + def test_deprecated_key_from_list(self): + # You should see logger messages like: + # "Deprecated config key (ignoring): MODEL.DILATION" + opts = ['FINAL_MSG', 'foobar', 'MODEL.DILATION', 2] + with self.assertRaises(AttributeError): + _ = cfg.FINAL_MSG # noqa + with self.assertRaises(AttributeError): + _ = cfg.MODEL.DILATION # noqa + core_config.merge_cfg_from_list(opts) + with self.assertRaises(AttributeError): + _ = cfg.FINAL_MSG # noqa + with self.assertRaises(AttributeError): + _ = cfg.MODEL.DILATION # noqa + + def test_deprecated_key_from_file(self): + # You should see logger messages like: 
+ # "Deprecated config key (ignoring): MODEL.DILATION" + with tempfile.NamedTemporaryFile() as f: + cfg2 = copy.deepcopy(cfg) + cfg2.MODEL.DILATION = 2 + envu.yaml_dump(cfg2, f) + with self.assertRaises(AttributeError): + _ = cfg.MODEL.DILATION # noqa + core_config.merge_cfg_from_file(f.name) + with self.assertRaises(AttributeError): + _ = cfg.MODEL.DILATION # noqa + + def test_renamed_key_from_list(self): + # You should see logger messages like: + # "Key EXAMPLE.RENAMED.KEY was renamed to EXAMPLE.KEY; + # please update your config" + opts = ['EXAMPLE.RENAMED.KEY', 'foobar'] + with self.assertRaises(AttributeError): + _ = cfg.EXAMPLE.RENAMED.KEY # noqa + with self.assertRaises(KeyError): + core_config.merge_cfg_from_list(opts) + + def test_renamed_key_from_file(self): + # You should see logger messages like: + # "Key EXAMPLE.RENAMED.KEY was renamed to EXAMPLE.KEY; + # please update your config" + with tempfile.NamedTemporaryFile() as f: + cfg2 = copy.deepcopy(cfg) + cfg2.EXAMPLE = AttrDict() + cfg2.EXAMPLE.RENAMED = AttrDict() + cfg2.EXAMPLE.RENAMED.KEY = 'foobar' + envu.yaml_dump(cfg2, f) + with self.assertRaises(AttributeError): + _ = cfg.EXAMPLE.RENAMED.KEY # noqa + with self.assertRaises(KeyError): + core_config.merge_cfg_from_file(f.name) + + +if __name__ == '__main__': + logging_utils.setup_logging(__name__) + unittest.main() diff --git a/detectron/tests/test_loader.py b/detectron/tests/test_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ac3a94c2dc04aa5a717a3b917bf86a1c6ac3fea6 --- /dev/null +++ b/detectron/tests/test_loader.py @@ -0,0 +1,121 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import logging +import unittest +import mock + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import muji +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.roi_data.loader import RoIDataLoader +import detectron.utils.logging as logging_utils + + +def get_roidb_blobs(roidb): + blobs = {} + blobs['data'] = np.stack([entry['data'] for entry in roidb]) + return blobs, True + + +def get_net(data_loader, name): + logger = logging.getLogger(__name__) + blob_names = data_loader.get_output_names() + net = core.Net(name) + net.type = 'dag' + for gpu_id in range(cfg.NUM_GPUS): + with core.NameScope('gpu_{}'.format(gpu_id)): + with core.DeviceScope(muji.OnGPU(gpu_id)): + for blob_name in blob_names: + blob = core.ScopedName(blob_name) + workspace.CreateBlob(blob) + net.DequeueBlobs( + data_loader._blobs_queue_name, blob_names) + logger.info("Protobuf:\n" + str(net.Proto())) + + return net + + +def get_roidb_sample_data(sample_data): + roidb = [] + for _ in range(np.random.randint(4, 10)): + roidb.append({'data': sample_data}) + return roidb + + +def create_loader_and_network(sample_data, name): + roidb = get_roidb_sample_data(sample_data) + loader = RoIDataLoader(roidb) + net 
= get_net(loader, 'dequeue_net_train') + loader.register_sigint_handler() + loader.start(prefill=False) + return loader, net + + +def run_net(net): + workspace.RunNetOnce(net) + gpu_dev = core.DeviceOption(caffe2_pb2.CUDA, 0) + name_scope = 'gpu_{}'.format(0) + with core.NameScope(name_scope): + with core.DeviceScope(gpu_dev): + data = workspace.FetchBlob(core.ScopedName('data')) + return data + + +class TestRoIDataLoader(unittest.TestCase): + @mock.patch( + 'detectron.roi_data.loader.get_minibatch_blob_names', + return_value=[u'data'] + ) + @mock.patch( + 'detectron.roi_data.loader.get_minibatch', + side_effect=get_roidb_blobs + ) + def test_two_parallel_loaders(self, _1, _2): + train_data = np.random.rand(2, 3, 3).astype(np.float32) + train_loader, train_net = create_loader_and_network(train_data, + 'dequeue_net_train') + test_data = np.random.rand(2, 4, 4).astype(np.float32) + test_loader, test_net = create_loader_and_network(test_data, + 'dequeue_net_test') + for _ in range(5): + data = run_net(train_net) + self.assertEqual(data[0].tolist(), train_data.tolist()) + data = run_net(test_net) + self.assertEqual(data[0].tolist(), test_data.tolist()) + test_loader.shutdown() + train_loader.shutdown() + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + logger = logging_utils.setup_logging(__name__) + logger.setLevel(logging.DEBUG) + logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO) + np.random.seed(cfg.RNG_SEED) + cfg.TRAIN.ASPECT_GROUPING = False + cfg.NUM_GPUS = 2 + assert_and_infer_cfg() + unittest.main() diff --git a/detectron/tests/test_restore_checkpoint.py b/detectron/tests/test_restore_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2694eee95c906565f6bb5ff861329e3b2392b089 --- /dev/null +++ b/detectron/tests/test_restore_checkpoint.py @@ -0,0 +1,132 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import logging +import numpy as np +import os +import shutil +import tempfile + +from caffe2.python import workspace + +from detectron.core.config import assert_and_infer_cfg +from detectron.core.config import cfg +from detectron.core.config import get_output_dir +from detectron.datasets.roidb import combined_roidb_for_training +from detectron.modeling import model_builder +from detectron.utils.logging import setup_logging +import detectron.utils.c2 as c2_utils +import detectron.utils.net as nu + +c2_utils.import_detectron_ops() + + +def get_params(model): + blobs = {} # gpu_0 blobs with unscoped_name as key + all_blobs = {} # all blobs with scoped name as key + # Save all parameters + for param in model.params: + scoped_name = str(param) + unscoped_name = c2_utils.UnscopeName(scoped_name) + if 'gpu_0' in scoped_name: + blobs[unscoped_name] = workspace.FetchBlob(scoped_name) + all_blobs[scoped_name] = workspace.FetchBlob(scoped_name) + for param in model.TrainableParams(): + scoped_name = str(param) + '_momentum' + unscoped_name = c2_utils.UnscopeName(scoped_name) + if 'gpu_0' in scoped_name: + blobs[unscoped_name] = workspace.FetchBlob(scoped_name) + all_blobs[scoped_name] = 
workspace.FetchBlob(scoped_name) + return blobs, all_blobs + + +def add_momentum_init_ops(model): + for param in model.TrainableParams(gpu_id=0): + model.param_init_net.GaussianFill( + [param + '_momentum'], param + '_momentum', mean=0.0, std=1.0) + + +def init_weights(model): + # init weights in gpu_id = 0 and then broadcast + workspace.RunNetOnce(model.param_init_net) + nu.broadcast_parameters(model) + + +def test_restore_checkpoint(): + # Create Model + model = model_builder.create(cfg.MODEL.TYPE, train=True) + add_momentum_init_ops(model) + init_weights(model) + # Fill input blobs + roidb = combined_roidb_for_training( + cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES + ) + model_builder.add_training_inputs(model, roidb=roidb) + workspace.CreateNet(model.net) + # Bookkeeping for checkpoint creation + iter_num = 0 + checkpoints = {} + output_dir = get_output_dir(cfg.TRAIN.DATASETS, training=True) + chk_file_path = os.path.join(output_dir, 'model_iter{}.pkl'.format(iter_num)) + checkpoints[iter_num] = chk_file_path + # Save model weights + nu.save_model_to_weights_file(checkpoints[iter_num], model) + orig_gpu_0_params, orig_all_params = get_params(model) + # Change the model weights + init_weights(model) + # Reload the weights in the model + nu.initialize_gpu_from_weights_file(model, chk_file_path, gpu_id=0) + nu.broadcast_parameters(model) + shutil.rmtree(cfg.OUTPUT_DIR) + _, restored_all_params = get_params(model) + # Check if all params are loaded correctly + for scoped_name, blob in orig_all_params.items(): + np.testing.assert_array_equal(blob, restored_all_params[scoped_name]) + # Check if broadcast_parameters works + for scoped_name, blob in restored_all_params.items(): + unscoped_name = c2_utils.UnscopeName(scoped_name) + np.testing.assert_array_equal(blob, orig_gpu_0_params[unscoped_name]) + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + logger = setup_logging(__name__) + logger.setLevel(logging.DEBUG) + 
logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO) + np.random.seed(cfg.RNG_SEED) + output_dir = tempfile.mkdtemp() + # Generate config for test + cfg.MODEL.TYPE = 'generalized_rcnn' + cfg.MODEL.CONV_BODY = 'FPN.add_fpn_ResNet50_conv5_body' + cfg.MODEL.NUM_CLASSES = 81 + cfg.MODEL.FASTER_RCNN = True + cfg.FPN.FPN_ON = True + cfg.FPN.MULTILEVEL_ROIS = True + cfg.FPN.MULTILEVEL_RPN = True + cfg.FAST_RCNN.ROI_BOX_HEAD = 'fast_rcnn_heads.add_roi_2mlp_head' + cfg.FAST_RCNN.ROI_XFORM_METHOD = 'RoIAlign' + cfg.OUTPUT_DIR = output_dir + cfg.TRAIN.DATASETS = ('coco_2014_minival',) + cfg.TRAIN.WEIGHTS = b'' + for num_gpu in range(workspace.NumCudaDevices()): + cfg.immutable(False) + cfg.NUM_GPUS = num_gpu + 1 + assert_and_infer_cfg() + test_restore_checkpoint() diff --git a/detectron/tests/test_smooth_l1_loss_op.py b/detectron/tests/test_smooth_l1_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bd0bdb1af1f0b891473e28696ee712f082046c55 --- /dev/null +++ b/detectron/tests/test_smooth_l1_loss_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import unittest + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import gradient_checker +from caffe2.python import workspace + +import detectron.utils.c2 as c2_utils +import detectron.utils.logging as logging_utils + + +class SmoothL1LossTest(unittest.TestCase): + def test_forward_and_gradient(self): + Y = np.random.randn(128, 4 * 21).astype(np.float32) + Y_hat = np.random.randn(128, 4 * 21).astype(np.float32) + inside_weights = np.random.randn(128, 4 * 21).astype(np.float32) + inside_weights[inside_weights < 0] = 0 + outside_weights = np.random.randn(128, 4 * 21).astype(np.float32) + outside_weights[outside_weights < 0] = 0 + scale = np.random.random() + beta = np.random.random() + + op = core.CreateOperator( + 'SmoothL1Loss', ['Y_hat', 'Y', 'inside_weights', 'outside_weights'], + ['loss'], + scale=scale, + beta=beta + ) + + gc = gradient_checker.GradientChecker( + stepsize=0.005, + threshold=0.005, + device_option=core.DeviceOption(caffe2_pb2.CUDA, 0) + ) + + res, grad, grad_estimated = gc.CheckSimple( + op, [Y_hat, Y, inside_weights, outside_weights], 0, [0] + ) + + self.assertTrue( + grad.shape == grad_estimated.shape, + 'Fail check: grad.shape != grad_estimated.shape' + ) + + # To inspect the gradient and estimated gradient: + # np.set_printoptions(precision=3, suppress=True) + # print('grad:') + # print(grad) + # print('grad_estimated:') + # print(grad_estimated) + + self.assertTrue(res) + + +if __name__ == '__main__': + c2_utils.import_detectron_ops() + assert 'SmoothL1Loss' in workspace.RegisteredOperators() + logging_utils.setup_logging(__name__) + unittest.main() diff --git a/detectron/tests/test_spatial_narrow_as_op.py 
b/detectron/tests/test_spatial_narrow_as_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ca101aa9b2715d66f36b7847ed1ed8f83f13c872 --- /dev/null +++ b/detectron/tests/test_spatial_narrow_as_op.py @@ -0,0 +1,91 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import unittest + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import gradient_checker +from caffe2.python import workspace + +import detectron.utils.c2 as c2_utils +import detectron.utils.logging as logging_utils + + +class SpatialNarrowAsOpTest(unittest.TestCase): + def _run_test(self, A, B, check_grad=False): + with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + op = core.CreateOperator('SpatialNarrowAs', ['A', 'B'], ['C']) + workspace.FeedBlob('A', A) + workspace.FeedBlob('B', B) + workspace.RunOperatorOnce(op) + C = workspace.FetchBlob('C') + + if check_grad: + gc = gradient_checker.GradientChecker( + stepsize=0.005, + threshold=0.005, + device_option=core.DeviceOption(caffe2_pb2.CUDA, 0) + ) + + res, grad, grad_estimated = gc.CheckSimple(op, [A, B], 0, [0]) + self.assertTrue(res, 'Grad check failed') + + dims = C.shape + C_ref = 
A[:dims[0], :dims[1], :dims[2], :dims[3]] + np.testing.assert_allclose(C, C_ref, rtol=1e-5, atol=1e-08) + + def test_small_forward_and_gradient(self): + A = np.random.randn(2, 3, 5, 7).astype(np.float32) + B = np.random.randn(2, 3, 2, 2).astype(np.float32) + self._run_test(A, B, check_grad=True) + + A = np.random.randn(2, 3, 5, 7).astype(np.float32) + B = np.random.randn(2, 3, 5).astype(np.float32) + self._run_test(A, B, check_grad=True) + + def test_large_forward(self): + A = np.random.randn(2, 256, 42, 100).astype(np.float32) + B = np.random.randn(2, 256, 35, 87).astype(np.float32) + self._run_test(A, B) + + A = np.random.randn(2, 256, 42, 87).astype(np.float32) + B = np.random.randn(2, 256, 35, 87).astype(np.float32) + self._run_test(A, B) + + def test_size_exceptions(self): + A = np.random.randn(2, 256, 42, 86).astype(np.float32) + B = np.random.randn(2, 256, 35, 87).astype(np.float32) + with self.assertRaises(RuntimeError): + self._run_test(A, B) + + A = np.random.randn(2, 255, 42, 88).astype(np.float32) + B = np.random.randn(2, 256, 35, 87).astype(np.float32) + with self.assertRaises(RuntimeError): + self._run_test(A, B) + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', '--caffe2_log_level=0']) + c2_utils.import_detectron_ops() + assert 'SpatialNarrowAs' in workspace.RegisteredOperators() + logging_utils.setup_logging(__name__) + unittest.main() diff --git a/detectron/tests/test_zero_even_op.py b/detectron/tests/test_zero_even_op.py new file mode 100644 index 0000000000000000000000000000000000000000..82076a8a9043b4c635905a3ff5817765da214beb --- /dev/null +++ b/detectron/tests/test_zero_even_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import unittest + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import workspace + +import detectron.utils.c2 as c2_utils + + +class ZeroEvenOpTest(unittest.TestCase): + + def _run_zero_even_op(self, X): + op = core.CreateOperator('ZeroEven', ['X'], ['Y']) + workspace.FeedBlob('X', X) + workspace.RunOperatorOnce(op) + Y = workspace.FetchBlob('Y') + return Y + + def _run_zero_even_op_gpu(self, X): + with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + op = core.CreateOperator('ZeroEven', ['X'], ['Y']) + workspace.FeedBlob('X', X) + workspace.RunOperatorOnce(op) + Y = workspace.FetchBlob('Y') + return Y + + def test_throws_on_non_1D_arrays(self): + X = np.zeros((2, 2), dtype=np.float32) + with self.assertRaisesRegexp(RuntimeError, 'X\.ndim\(\) == 1'): + self._run_zero_even_op(X) + + def test_handles_empty_arrays(self): + X = np.array([], dtype=np.float32) + Y_exp = np.copy(X) + Y_act = self._run_zero_even_op(X) + np.testing.assert_allclose(Y_act, Y_exp) + + def test_sets_vals_at_even_inds_to_zero(self): + X = np.array([0, 1, 2, 3, 4], dtype=np.float32) + Y_exp = np.array([0, 1, 0, 3, 0], dtype=np.float32) + Y_act = self._run_zero_even_op(X) + np.testing.assert_allclose(Y_act[0::2], Y_exp[0::2]) + + def test_preserves_vals_at_odd_inds(self): + X = 
np.array([0, 1, 2, 3, 4], dtype=np.float32) + Y_exp = np.array([0, 1, 0, 3, 0], dtype=np.float32) + Y_act = self._run_zero_even_op(X) + np.testing.assert_allclose(Y_act[1::2], Y_exp[1::2]) + + def test_handles_even_length_arrays(self): + X = np.random.rand(64).astype(np.float32) + Y_exp = np.copy(X) + Y_exp[0::2] = 0.0 + Y_act = self._run_zero_even_op(X) + np.testing.assert_allclose(Y_act, Y_exp) + + def test_handles_odd_length_arrays(self): + X = np.random.randn(77).astype(np.float32) + Y_exp = np.copy(X) + Y_exp[0::2] = 0.0 + Y_act = self._run_zero_even_op(X) + np.testing.assert_allclose(Y_act, Y_exp) + + def test_gpu_throws_on_non_1D_arrays(self): + X = np.zeros((2, 2), dtype=np.float32) + with self.assertRaisesRegexp(RuntimeError, 'X\.ndim\(\) == 1'): + self._run_zero_even_op_gpu(X) + + def test_gpu_handles_empty_arrays(self): + X = np.array([], dtype=np.float32) + Y_exp = np.copy(X) + Y_act = self._run_zero_even_op_gpu(X) + np.testing.assert_allclose(Y_act, Y_exp) + + def test_gpu_sets_vals_at_even_inds_to_zero(self): + X = np.array([0, 1, 2, 3, 4], dtype=np.float32) + Y_exp = np.array([0, 1, 0, 3, 0], dtype=np.float32) + Y_act = self._run_zero_even_op_gpu(X) + np.testing.assert_allclose(Y_act[0::2], Y_exp[0::2]) + + def test_gpu_preserves_vals_at_odd_inds(self): + X = np.array([0, 1, 2, 3, 4], dtype=np.float32) + Y_exp = np.array([0, 1, 0, 3, 0], dtype=np.float32) + Y_act = self._run_zero_even_op_gpu(X) + np.testing.assert_allclose(Y_act[1::2], Y_exp[1::2]) + + def test_gpu_handles_even_length_arrays(self): + X = np.random.rand(64).astype(np.float32) + Y_exp = np.copy(X) + Y_exp[0::2] = 0.0 + Y_act = self._run_zero_even_op_gpu(X) + np.testing.assert_allclose(Y_act, Y_exp) + + def test_gpu_handles_odd_length_arrays(self): + X = np.random.randn(77).astype(np.float32) + Y_exp = np.copy(X) + Y_exp[0::2] = 0.0 + Y_act = self._run_zero_even_op_gpu(X) + np.testing.assert_allclose(Y_act, Y_exp) + + +if __name__ == '__main__': + workspace.GlobalInit(['caffe2', 
'--caffe2_log_level=0']) + c2_utils.import_custom_ops() + assert 'ZeroEven' in workspace.RegisteredOperators() + unittest.main() diff --git a/detectron/utils/__init__.py b/detectron/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/detectron/utils/blob.py b/detectron/utils/blob.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf04baf4f8049d4b1e69f977bf6820b80fba1d1 --- /dev/null +++ b/detectron/utils/blob.py @@ -0,0 +1,181 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Caffe2 blob helper functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np +from six.moves import cPickle as pickle + +from caffe2.proto import caffe2_pb2 + +from detectron.core.config import cfg + + +def get_image_blob(im, target_scale, target_max_size): + """Convert an image into a network input. 
+ + Arguments: + im (ndarray): a color image in BGR order + + Returns: + blob (ndarray): a data blob holding an image pyramid + im_scale (float): image scale (target size) / (original size) + im_info (ndarray) + """ + processed_im, im_scale = prep_im_for_blob( + im, cfg.PIXEL_MEANS, target_scale, target_max_size + ) + blob = im_list_to_blob(processed_im) + # NOTE: this height and width may be larger than actual scaled input image + # due to the FPN.COARSEST_STRIDE related padding in im_list_to_blob. We are + # maintaining this behavior for now to make existing results exactly + # reproducible (in practice using the true input image height and width + # yields nearly the same results, but they are sometimes slightly different + # because predictions near the edge of the image will be pruned more + # aggressively). + height, width = blob.shape[2], blob.shape[3] + im_info = np.hstack((height, width, im_scale))[np.newaxis, :] + return blob, im_scale, im_info.astype(np.float32) + + +def im_list_to_blob(ims): + """Convert a list of images into a network input. Assumes images were + prepared using prep_im_for_blob or equivalent: i.e. + - BGR channel order + - pixel means subtracted + - resized to the desired input size + - float32 numpy ndarray format + Output is a 4D HCHW tensor of the images concatenated along axis 0 with + shape. 
+ """ + if not isinstance(ims, list): + ims = [ims] + max_shape = np.array([im.shape for im in ims]).max(axis=0) + # Pad the image so they can be divisible by a stride + if cfg.FPN.FPN_ON: + stride = float(cfg.FPN.COARSEST_STRIDE) + max_shape[0] = int(np.ceil(max_shape[0] / stride) * stride) + max_shape[1] = int(np.ceil(max_shape[1] / stride) * stride) + + num_images = len(ims) + blob = np.zeros( + (num_images, max_shape[0], max_shape[1], 3), dtype=np.float32 + ) + for i in range(num_images): + im = ims[i] + blob[i, 0:im.shape[0], 0:im.shape[1], :] = im + # Move channels (axis 3) to axis 1 + # Axis order will become: (batch elem, channel, height, width) + channel_swap = (0, 3, 1, 2) + blob = blob.transpose(channel_swap) + return blob + + +def prep_im_for_blob(im, pixel_means, target_size, max_size): + """Prepare an image for use as a network input blob. Specially: + - Subtract per-channel pixel mean + - Convert to float32 + - Rescale to each of the specified target size (capped at max_size) + Returns a list of transformed images, one for each target size. Also returns + the scale factors that were used to compute each returned image. + """ + im = im.astype(np.float32, copy=False) + im -= pixel_means + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + # Prevent the biggest axis from being more than max_size + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + im = cv2.resize( + im, + None, + None, + fx=im_scale, + fy=im_scale, + interpolation=cv2.INTER_LINEAR + ) + return im, im_scale + + +def zeros(shape, int32=False): + """Return a blob of all zeros of the given shape with the correct float or + int data type. + """ + return np.zeros(shape, dtype=np.int32 if int32 else np.float32) + + +def ones(shape, int32=False): + """Return a blob of all ones of the given shape with the correct float or + int data type. 
+ """ + return np.ones(shape, dtype=np.int32 if int32 else np.float32) + + +def py_op_copy_blob(blob_in, blob_out): + """Copy a numpy ndarray given as blob_in into the Caffe2 CPUTensor blob + given as blob_out. Supports float32 and int32 blob data types. This function + is intended for copying numpy data into a Caffe2 blob in PythonOps. + """ + # Some awkward voodoo required by Caffe2 to support int32 blobs + needs_int32_init = False + try: + _ = blob.data.dtype # noqa + except Exception: + needs_int32_init = blob_in.dtype == np.int32 + if needs_int32_init: + # init can only take a list (failed on tuple) + blob_out.init(list(blob_in.shape), caffe2_pb2.TensorProto.INT32) + else: + blob_out.reshape(blob_in.shape) + blob_out.data[...] = blob_in + + +def get_loss_gradients(model, loss_blobs): + """Generate a gradient of 1 for each loss specified in 'loss_blobs'""" + loss_gradients = {} + for b in loss_blobs: + loss_grad = model.net.ConstantFill(b, [b + '_grad'], value=1.0) + loss_gradients[str(b)] = str(loss_grad) + return loss_gradients + + +def serialize(obj): + """Serialize a Python object using pickle and encode it as an array of + float32 values so that it can be feed into the workspace. See deserialize(). + """ + return np.fromstring(pickle.dumps(obj), dtype=np.uint8).astype(np.float32) + + +def deserialize(arr): + """Unserialize a Python object from an array of float32 values fetched from + a workspace. See serialize(). + """ + return pickle.loads(arr.astype(np.uint8).tobytes()) diff --git a/detectron/utils/boxes.py b/detectron/utils/boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..7564aabc8a492c96099d07a0e62fc6cba7458645 --- /dev/null +++ b/detectron/utils/boxes.py @@ -0,0 +1,338 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Box manipulation functions. The internal Detectron box format is +[x1, y1, x2, y2] where (x1, y1) specify the top-left box corner and (x2, y2) +specify the bottom-right box corner. Boxes from external sources, e.g., +datasets, may be in other formats (such as [x, y, w, h]) and require conversion. + +This module uses a convention that may seem strange at first: the width of a box +is computed as x2 - x1 + 1 (likewise for height). The "+ 1" dates back to old +object detection days when the coordinates were integer pixel indices, rather +than floating point coordinates in a subpixel coordinate frame. A box with x2 = +x1 and y2 = y1 was taken to include a single pixel, having a width of 1, and +hence requiring the "+ 1". Now, most datasets will likely provide boxes with +floating point coordinates and the width should be more reasonably computed as +x2 - x1. + +In practice, as long as a model is trained and tested with a consistent +convention either decision seems to be ok (at least in our experience on COCO). +Since we have a long history of training models with the "+ 1" convention, we +are reluctant to change it even if our modern tastes prefer not to use it. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.cython_bbox as cython_bbox +import detectron.utils.cython_nms as cython_nms + +bbox_overlaps = cython_bbox.bbox_overlaps + + +def boxes_area(boxes): + """Compute the area of an array of boxes.""" + w = (boxes[:, 2] - boxes[:, 0] + 1) + h = (boxes[:, 3] - boxes[:, 1] + 1) + areas = w * h + assert np.all(areas >= 0), 'Negative areas founds' + return areas + + +def unique_boxes(boxes, scale=1.0): + """Return indices of unique boxes.""" + v = np.array([1, 1e3, 1e6, 1e9]) + hashes = np.round(boxes * scale).dot(v) + _, index = np.unique(hashes, return_index=True) + return np.sort(index) + + +def xywh_to_xyxy(xywh): + """Convert [x1 y1 w h] box format to [x1 y1 x2 y2] format.""" + if isinstance(xywh, (list, tuple)): + # Single box given as a list of coordinates + assert len(xywh) == 4 + x1, y1 = xywh[0], xywh[1] + x2 = x1 + np.maximum(0., xywh[2] - 1.) + y2 = y1 + np.maximum(0., xywh[3] - 1.) 
+ return (x1, y1, x2, y2) + elif isinstance(xywh, np.ndarray): + # Multiple boxes given as a 2D ndarray + return np.hstack( + (xywh[:, 0:2], xywh[:, 0:2] + np.maximum(0, xywh[:, 2:4] - 1)) + ) + else: + raise TypeError('Argument xywh must be a list, tuple, or numpy array.') + + +def xyxy_to_xywh(xyxy): + """Convert [x1 y1 x2 y2] box format to [x1 y1 w h] format.""" + if isinstance(xyxy, (list, tuple)): + # Single box given as a list of coordinates + assert len(xyxy) == 4 + x1, y1 = xyxy[0], xyxy[1] + w = xyxy[2] - x1 + 1 + h = xyxy[3] - y1 + 1 + return (x1, y1, w, h) + elif isinstance(xyxy, np.ndarray): + # Multiple boxes given as a 2D ndarray + return np.hstack((xyxy[:, 0:2], xyxy[:, 2:4] - xyxy[:, 0:2] + 1)) + else: + raise TypeError('Argument xyxy must be a list, tuple, or numpy array.') + + +def filter_small_boxes(boxes, min_size): + """Keep boxes with width and height both greater than min_size.""" + w = boxes[:, 2] - boxes[:, 0] + 1 + h = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((w > min_size) & (h > min_size))[0] + return keep + + +def clip_boxes_to_image(boxes, height, width): + """Clip an array of boxes to an image with the given height and width.""" + boxes[:, [0, 2]] = np.minimum(width - 1., np.maximum(0., boxes[:, [0, 2]])) + boxes[:, [1, 3]] = np.minimum(height - 1., np.maximum(0., boxes[:, [1, 3]])) + return boxes + + +def clip_xyxy_to_image(x1, y1, x2, y2, height, width): + """Clip coordinates to an image with the given height and width.""" + x1 = np.minimum(width - 1., np.maximum(0., x1)) + y1 = np.minimum(height - 1., np.maximum(0., y1)) + x2 = np.minimum(width - 1., np.maximum(0., x2)) + y2 = np.minimum(height - 1., np.maximum(0., y2)) + return x1, y1, x2, y2 + + +def clip_tiled_boxes(boxes, im_shape): + """Clip boxes to image boundaries. 
im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert boxes.shape[1] % 4 == 0, \ + 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( + boxes.shape[1] + ) + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes + + +def bbox_transform(boxes, deltas, weights=(1.0, 1.0, 1.0, 1.0)): + """Forward transform that maps proposal boxes to predicted ground-truth + boxes using bounding-box regression deltas. See bbox_transform_inv for a + description of the weights argument. + """ + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + + # Prevent sending too large values into np.exp() + dw = np.minimum(dw, cfg.BBOX_XFORM_CLIP) + dh = np.minimum(dh, cfg.BBOX_XFORM_CLIP) + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be 
fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes + + +def bbox_transform_inv(boxes, gt_boxes, weights=(1.0, 1.0, 1.0, 1.0)): + """Inverse transform that computes target bounding-box regression deltas + given proposal boxes and ground-truth boxes. The weights argument should be + a 4-tuple of multiplicative weights that are applied to the regression + target. + + In older versions of this code (and in py-faster-rcnn), the weights were set + such that the regression deltas would have unit standard deviation on the + training dataset. Presently, rather than computing these statistics exactly, + we use a fixed set of weights (10., 10., 5., 5.) by default. These are + approximately the weights one would get from COCO using the previous unit + stdev heuristic. + """ + ex_widths = boxes[:, 2] - boxes[:, 0] + 1.0 + ex_heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ex_ctr_x = boxes[:, 0] + 0.5 * ex_widths + ex_ctr_y = boxes[:, 1] + 0.5 * ex_heights + + gt_widths = gt_boxes[:, 2] - gt_boxes[:, 0] + 1.0 + gt_heights = gt_boxes[:, 3] - gt_boxes[:, 1] + 1.0 + gt_ctr_x = gt_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = gt_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * np.log(gt_widths / ex_widths) + targets_dh = wh * np.log(gt_heights / ex_heights) + + targets = np.vstack((targets_dx, targets_dy, targets_dw, + targets_dh)).transpose() + return targets + + +def expand_boxes(boxes, scale): + """Expand an array of boxes by a given scale.""" + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = np.zeros(boxes.shape) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] 
= y_c + h_half + + return boxes_exp + + +def flip_boxes(boxes, im_width): + """Flip boxes horizontally.""" + boxes_flipped = boxes.copy() + boxes_flipped[:, 0::4] = im_width - boxes[:, 2::4] - 1 + boxes_flipped[:, 2::4] = im_width - boxes[:, 0::4] - 1 + return boxes_flipped + + +def aspect_ratio(boxes, aspect_ratio): + """Perform width-relative aspect ratio transformation.""" + boxes_ar = boxes.copy() + boxes_ar[:, 0::4] = aspect_ratio * boxes[:, 0::4] + boxes_ar[:, 2::4] = aspect_ratio * boxes[:, 2::4] + return boxes_ar + + +def box_voting(top_dets, all_dets, thresh, scoring_method='ID', beta=1.0): + """Apply bounding-box voting to refine `top_dets` by voting with `all_dets`. + See: https://arxiv.org/abs/1505.01749. Optional score averaging (not in the + referenced paper) can be applied by setting `scoring_method` appropriately. + """ + # top_dets is [N, 5] each row is [x1 y1 x2 y2, sore] + # all_dets is [N, 5] each row is [x1 y1 x2 y2, sore] + top_dets_out = top_dets.copy() + top_boxes = top_dets[:, :4] + all_boxes = all_dets[:, :4] + all_scores = all_dets[:, 4] + top_to_all_overlaps = bbox_overlaps(top_boxes, all_boxes) + for k in range(top_dets_out.shape[0]): + inds_to_vote = np.where(top_to_all_overlaps[k] >= thresh)[0] + boxes_to_vote = all_boxes[inds_to_vote, :] + ws = all_scores[inds_to_vote] + top_dets_out[k, :4] = np.average(boxes_to_vote, axis=0, weights=ws) + if scoring_method == 'ID': + # Identity, nothing to do + pass + elif scoring_method == 'TEMP_AVG': + # Average probabilities (considered as P(detected class) vs. + # P(not the detected class)) after smoothing with a temperature + # hyperparameter. 
+ P = np.vstack((ws, 1.0 - ws)) + P_max = np.max(P, axis=0) + X = np.log(P / P_max) + X_exp = np.exp(X / beta) + P_temp = X_exp / np.sum(X_exp, axis=0) + P_avg = P_temp[0].mean() + top_dets_out[k, 4] = P_avg + elif scoring_method == 'AVG': + # Combine new probs from overlapping boxes + top_dets_out[k, 4] = ws.mean() + elif scoring_method == 'IOU_AVG': + P = ws + ws = top_to_all_overlaps[k, inds_to_vote] + P_avg = np.average(P, weights=ws) + top_dets_out[k, 4] = P_avg + elif scoring_method == 'GENERALIZED_AVG': + P_avg = np.mean(ws**beta)**(1.0 / beta) + top_dets_out[k, 4] = P_avg + elif scoring_method == 'QUASI_SUM': + top_dets_out[k, 4] = ws.sum() / float(len(ws))**beta + else: + raise NotImplementedError( + 'Unknown scoring method {}'.format(scoring_method) + ) + + return top_dets_out + + +def nms(dets, thresh): + """Apply classic DPM-style greedy NMS.""" + if dets.shape[0] == 0: + return [] + return cython_nms.nms(dets, thresh) + + +def soft_nms( + dets, sigma=0.5, overlap_thresh=0.3, score_thresh=0.001, method='linear' +): + """Apply the soft NMS algorithm from https://arxiv.org/abs/1704.04503.""" + if dets.shape[0] == 0: + return dets, [] + + methods = {'hard': 0, 'linear': 1, 'gaussian': 2} + assert method in methods, 'Unknown soft_nms method: {}'.format(method) + + dets, keep = cython_nms.soft_nms( + np.ascontiguousarray(dets, dtype=np.float32), + np.float32(sigma), + np.float32(overlap_thresh), + np.float32(score_thresh), + np.uint8(methods[method]) + ) + return dets, keep diff --git a/detectron/utils/c2.py b/detectron/utils/c2.py new file mode 100644 index 0000000000000000000000000000000000000000..ba0cb3ee8afe728b8c352f0b88e59fb6b261fc8b --- /dev/null +++ b/detectron/utils/c2.py @@ -0,0 +1,166 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Helpful utilities for working with Caffe2.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from six import string_types +import contextlib +import subprocess + +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python import dyndep +from caffe2.python import scope +from caffe2.python import workspace + +import detectron.utils.env as envu + + +def import_contrib_ops(): + """Import contrib ops needed by Detectron.""" + envu.import_nccl_ops() + + +def import_detectron_ops(): + """Import Detectron ops.""" + detectron_ops_lib = envu.get_detectron_ops_lib() + dyndep.InitOpsLibrary(detectron_ops_lib) + + +def import_custom_ops(): + """Import custom ops.""" + custom_ops_lib = envu.get_custom_ops_lib() + dyndep.InitOpsLibrary(custom_ops_lib) + + +def SuffixNet(name, net, prefix_len, outputs): + """Returns a new Net from the given Net (`net`) that includes only the ops + after removing the first `prefix_len` number of ops. The new Net is thus a + suffix of `net`. Blobs listed in `outputs` are registered as external output + blobs. 
+ """ + outputs = BlobReferenceList(outputs) + for output in outputs: + assert net.BlobIsDefined(output) + new_net = net.Clone(name) + + del new_net.Proto().op[:] + del new_net.Proto().external_input[:] + del new_net.Proto().external_output[:] + + # Add suffix ops + new_net.Proto().op.extend(net.Proto().op[prefix_len:]) + # Add external input blobs + # Treat any undefined blobs as external inputs + input_names = [ + i for op in new_net.Proto().op for i in op.input + if not new_net.BlobIsDefined(i)] + new_net.Proto().external_input.extend(input_names) + # Add external output blobs + output_names = [str(o) for o in outputs] + new_net.Proto().external_output.extend(output_names) + return new_net, [new_net.GetBlobRef(o) for o in output_names] + + +def BlobReferenceList(blob_ref_or_list): + """Ensure that the argument is returned as a list of BlobReferences.""" + if isinstance(blob_ref_or_list, core.BlobReference): + return [blob_ref_or_list] + elif type(blob_ref_or_list) in (list, tuple): + for b in blob_ref_or_list: + assert isinstance(b, core.BlobReference) + return blob_ref_or_list + else: + raise TypeError( + 'blob_ref_or_list must be a BlobReference or a list/tuple of ' + 'BlobReferences' + ) + + +def UnscopeName(possibly_scoped_name): + """Remove any name scoping from a (possibly) scoped name. For example, + convert the name 'gpu_0/foo' to 'foo'.""" + assert isinstance(possibly_scoped_name, string_types) + return possibly_scoped_name[ + possibly_scoped_name.rfind(scope._NAMESCOPE_SEPARATOR) + 1:] + + +@contextlib.contextmanager +def NamedCudaScope(gpu_id): + """Creates a GPU name scope and CUDA device scope. 
This function is provided + to reduce `with ...` nesting levels.""" + with GpuNameScope(gpu_id): + with CudaScope(gpu_id): + yield + + +@contextlib.contextmanager +def GpuNameScope(gpu_id): + """Create a name scope for GPU device `gpu_id`.""" + with core.NameScope('gpu_{:d}'.format(gpu_id)): + yield + + +@contextlib.contextmanager +def CudaScope(gpu_id): + """Create a CUDA device scope for GPU device `gpu_id`.""" + gpu_dev = CudaDevice(gpu_id) + with core.DeviceScope(gpu_dev): + yield + + +@contextlib.contextmanager +def CpuScope(): + """Create a CPU device scope.""" + cpu_dev = core.DeviceOption(caffe2_pb2.CPU) + with core.DeviceScope(cpu_dev): + yield + + +def CudaDevice(gpu_id): + """Create a Cuda device.""" + return core.DeviceOption(caffe2_pb2.CUDA, gpu_id) + + +def gauss_fill(std): + """Gaussian fill helper to reduce verbosity.""" + return ('GaussianFill', {'std': std}) + + +def const_fill(value): + """Constant fill helper to reduce verbosity.""" + return ('ConstantFill', {'value': value}) + + +def get_nvidia_info(): + return ( + get_nvidia_smi_output(), + workspace.GetCUDAVersion(), + workspace.GetCuDNNVersion(), + ) + + +def get_nvidia_smi_output(): + try: + info = subprocess.check_output(["nvidia-smi"], stderr=subprocess.STDOUT) + info = info.decode("utf8") + except Exception as e: + info = "Executing nvidia-smi failed: " + str(e) + return info.strip() diff --git a/detectron/utils/collections.py b/detectron/utils/collections.py new file mode 100644 index 0000000000000000000000000000000000000000..e62ce79f80512407d2beed1657be1a889fd037bd --- /dev/null +++ b/detectron/utils/collections.py @@ -0,0 +1,66 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""A simple attribute dictionary used for representing configuration options.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + + +class AttrDict(dict): + + IMMUTABLE = '__immutable__' + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__[AttrDict.IMMUTABLE] = False + + def __getattr__(self, name): + if name in self.__dict__: + return self.__dict__[name] + elif name in self: + return self[name] + else: + raise AttributeError(name) + + def __setattr__(self, name, value): + if not self.__dict__[AttrDict.IMMUTABLE]: + if name in self.__dict__: + self.__dict__[name] = value + else: + self[name] = value + else: + raise AttributeError( + 'Attempted to set "{}" to "{}", but AttrDict is immutable'. + format(name, value) + ) + + def immutable(self, is_immutable): + """Set immutability to is_immutable and recursively apply the setting + to all nested AttrDicts. 
+ """ + self.__dict__[AttrDict.IMMUTABLE] = is_immutable + # Recursively set immutable state + for v in self.__dict__.values(): + if isinstance(v, AttrDict): + v.immutable(is_immutable) + for v in self.values(): + if isinstance(v, AttrDict): + v.immutable(is_immutable) + + def is_immutable(self): + return self.__dict__[AttrDict.IMMUTABLE] diff --git a/detectron/utils/colormap.py b/detectron/utils/colormap.py new file mode 100644 index 0000000000000000000000000000000000000000..bc6869f289a9c47519ca69bdddba3dd4fa82ea27 --- /dev/null +++ b/detectron/utils/colormap.py @@ -0,0 +1,113 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""An awesome colormap for really neat visualizations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + + +def colormap(rgb=False): + color_list = np.array( + [ + 0.000, 0.447, 0.741, + 0.850, 0.325, 0.098, + 0.929, 0.694, 0.125, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.167, 0.000, 0.000, + 0.333, 0.000, 0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 
0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.286, 0.286, 0.286, + 0.429, 0.429, 0.429, + 0.571, 0.571, 0.571, + 0.714, 0.714, 0.714, + 0.857, 0.857, 0.857, + 1.000, 1.000, 1.000 + ] + ).astype(np.float32) + color_list = color_list.reshape((-1, 3)) * 255 + if not rgb: + color_list = color_list[:, ::-1] + return color_list diff --git a/detectron/utils/coordinator.py b/detectron/utils/coordinator.py new file mode 100644 index 0000000000000000000000000000000000000000..62eb25be2ec9a2635a186e215257eb1a53d1fa53 --- /dev/null +++ b/detectron/utils/coordinator.py @@ -0,0 +1,73 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Coordinated access to a shared multithreading/processing queue.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import contextlib +import logging +import threading +import traceback +from six.moves import queue as Queue + +log = logging.getLogger(__name__) + + +class Coordinator(object): + + def __init__(self): + self._event = threading.Event() + + def request_stop(self): + log.debug('Coordinator stopping') + self._event.set() + + def should_stop(self): + return self._event.is_set() + + def wait_for_stop(self): + return self._event.wait() + + @contextlib.contextmanager + def stop_on_exception(self): + try: + yield + except Exception: + if not self.should_stop(): + traceback.print_exc() + self.request_stop() + + +def coordinated_get(coordinator, queue): + while not coordinator.should_stop(): + try: + return queue.get(block=True, timeout=1.0) + except Queue.Empty: + continue + raise Exception('Coordinator stopped during get()') + + +def coordinated_put(coordinator, queue, element): + while not coordinator.should_stop(): + try: + queue.put(element, block=True, timeout=1.0) + return + except Queue.Full: + continue + raise Exception('Coordinator stopped during put()') diff --git a/detectron/utils/cython_bbox.pyx b/detectron/utils/cython_bbox.pyx new file mode 100644 index 0000000000000000000000000000000000000000..4c1f015f96fd08441f3e90835767e3ea9adfa25d --- /dev/null +++ b/detectron/utils/cython_bbox.pyx @@ -0,0 +1,73 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + +DTYPE = np.float32 +ctypedef np.float32_t DTYPE_t + +@cython.boundscheck(False) +def bbox_overlaps( + np.ndarray[DTYPE_t, ndim=2] boxes, + np.ndarray[DTYPE_t, ndim=2] query_boxes): + """ + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + cdef DTYPE_t iw, ih, box_area + cdef DTYPE_t ua + cdef unsigned int k, n + with nogil: + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + 1 + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + 1 + ) + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) * + (boxes[n, 3] - boxes[n, 1] + 1) + + box_area - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps diff --git 
a/detectron/utils/cython_nms.pyx b/detectron/utils/cython_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..0c1785b4f97d1b315b16700ef0cd59cc77af3fe2 --- /dev/null +++ b/detectron/utils/cython_nms.pyx @@ -0,0 +1,203 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + +cdef inline np.float32_t max(np.float32_t a, np.float32_t b) nogil: + return a if a >= b else b + +cdef inline np.float32_t min(np.float32_t a, np.float32_t b) nogil: + return a if a <= b else b + +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.wraparound(False) +def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float32_t thresh): + cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] + cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] + cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] + cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] + cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] + + cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) + cdef np.ndarray[np.int_t, 
ndim=1] order = scores.argsort()[::-1] + + cdef int ndets = dets.shape[0] + cdef np.ndarray[np.int_t, ndim=1] suppressed = \ + np.zeros((ndets), dtype=np.int) + + # nominal indices + cdef int _i, _j + # sorted indices + cdef int i, j + # temp variables for box i's (the box currently under consideration) + cdef np.float32_t ix1, iy1, ix2, iy2, iarea + # variables for computing overlap with box j (lower scoring box) + cdef np.float32_t xx1, yy1, xx2, yy2 + cdef np.float32_t w, h + cdef np.float32_t inter, ovr + + with nogil: + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return np.where(suppressed == 0)[0] + +# ---------------------------------------------------------- +# Soft-NMS: Improving Object Detection With One Line of Code +# Copyright (c) University of Maryland, College Park +# Licensed under The MIT License [see LICENSE for details] +# Written by Navaneeth Bodla and Bharat Singh +# ---------------------------------------------------------- +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.wraparound(False) +def soft_nms( + np.ndarray[float, ndim=2] boxes_in, + float sigma=0.5, + float Nt=0.3, + float threshold=0.001, + unsigned int method=0 +): + boxes = boxes_in.copy() + cdef unsigned int N = boxes.shape[0] + cdef float iw, ih, box_area + cdef float ua + cdef int pos = 0 + cdef float maxscore = 0 + cdef int maxpos = 0 + cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov + inds = np.arange(N) + + for i in range(N): + maxscore = boxes[i, 4] + maxpos = i + + tx1 = boxes[i,0] + ty1 = 
boxes[i,1] + tx2 = boxes[i,2] + ty2 = boxes[i,3] + ts = boxes[i,4] + ti = inds[i] + + pos = i + 1 + # get max box + while pos < N: + if maxscore < boxes[pos, 4]: + maxscore = boxes[pos, 4] + maxpos = pos + pos = pos + 1 + + # add max box as a detection + boxes[i,0] = boxes[maxpos,0] + boxes[i,1] = boxes[maxpos,1] + boxes[i,2] = boxes[maxpos,2] + boxes[i,3] = boxes[maxpos,3] + boxes[i,4] = boxes[maxpos,4] + inds[i] = inds[maxpos] + + # swap ith box with position of max box + boxes[maxpos,0] = tx1 + boxes[maxpos,1] = ty1 + boxes[maxpos,2] = tx2 + boxes[maxpos,3] = ty2 + boxes[maxpos,4] = ts + inds[maxpos] = ti + + tx1 = boxes[i,0] + ty1 = boxes[i,1] + tx2 = boxes[i,2] + ty2 = boxes[i,3] + ts = boxes[i,4] + + pos = i + 1 + # NMS iterations, note that N changes if detection boxes fall below + # threshold + while pos < N: + x1 = boxes[pos, 0] + y1 = boxes[pos, 1] + x2 = boxes[pos, 2] + y2 = boxes[pos, 3] + s = boxes[pos, 4] + + area = (x2 - x1 + 1) * (y2 - y1 + 1) + iw = (min(tx2, x2) - max(tx1, x1) + 1) + if iw > 0: + ih = (min(ty2, y2) - max(ty1, y1) + 1) + if ih > 0: + ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) + ov = iw * ih / ua #iou between max box and detection box + + if method == 1: # linear + if ov > Nt: + weight = 1 - ov + else: + weight = 1 + elif method == 2: # gaussian + weight = np.exp(-(ov * ov)/sigma) + else: # original NMS + if ov > Nt: + weight = 0 + else: + weight = 1 + + boxes[pos, 4] = weight*boxes[pos, 4] + + # if box score falls below threshold, discard the box by + # swapping with last box update N + if boxes[pos, 4] < threshold: + boxes[pos,0] = boxes[N-1, 0] + boxes[pos,1] = boxes[N-1, 1] + boxes[pos,2] = boxes[N-1, 2] + boxes[pos,3] = boxes[N-1, 3] + boxes[pos,4] = boxes[N-1, 4] + inds[pos] = inds[N-1] + N = N - 1 + pos = pos - 1 + + pos = pos + 1 + + return boxes[:N], inds[:N] diff --git a/detectron/utils/env.py b/detectron/utils/env.py new file mode 100644 index 
0000000000000000000000000000000000000000..128162fcf6929a3f40eae577dcb8560c31e29a2b --- /dev/null +++ b/detectron/utils/env.py @@ -0,0 +1,91 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Environment helper functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import sys +import yaml + +# Default value of the CMake install prefix +_CMAKE_INSTALL_PREFIX = '/usr/local' +# Detectron ops lib +_DETECTRON_OPS_LIB = 'libcaffe2_detectron_ops_gpu.so' + + +def get_runtime_dir(): + """Retrieve the path to the runtime directory.""" + return sys.path[0] + + +def get_py_bin_ext(): + """Retrieve python binary extension.""" + return '.py' + + +def set_up_matplotlib(): + """Set matplotlib up.""" + import matplotlib + # Use a non-interactive backend + matplotlib.use('Agg') + + +def exit_on_error(): + """Exit from a detectron tool when there's an error.""" + sys.exit(1) + + +def import_nccl_ops(): + """Import NCCL ops.""" + # There is no need to load NCCL ops since the + # NCCL dependency is built into the Caffe2 gpu lib + pass + + +def get_detectron_ops_lib(): + """Retrieve Detectron ops library.""" + # Candidate prefixes for detectron ops lib path + prefixes = [_CMAKE_INSTALL_PREFIX, sys.prefix, sys.exec_prefix] + sys.path + # 
Candidate subdirs for detectron ops lib + subdirs = ['lib', 'torch/lib'] + # Try to find detectron ops lib + for prefix in prefixes: + for subdir in subdirs: + ops_path = os.path.join(prefix, subdir, _DETECTRON_OPS_LIB) + if os.path.exists(ops_path): + print('Found Detectron ops lib: {}'.format(ops_path)) + return ops_path + raise Exception('Detectron ops lib not found') + + +def get_custom_ops_lib(): + """Retrieve custom ops library.""" + det_dir, _ = os.path.split(os.path.dirname(__file__)) + root_dir, _ = os.path.split(det_dir) + custom_ops_lib = os.path.join( + root_dir, 'build/libcaffe2_detectron_custom_ops_gpu.so') + assert os.path.exists(custom_ops_lib), \ + 'Custom ops lib not found at \'{}\''.format(custom_ops_lib) + return custom_ops_lib + + +# YAML load/dump function aliases +yaml_load = yaml.load +yaml_dump = yaml.dump diff --git a/detectron/utils/image.py b/detectron/utils/image.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a5d3652075da44a377be4243d5885732e7ab0f --- /dev/null +++ b/detectron/utils/image.py @@ -0,0 +1,45 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Image helper functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np + + +def aspect_ratio_rel(im, aspect_ratio): + """Performs width-relative aspect ratio transformation.""" + im_h, im_w = im.shape[:2] + im_ar_w = int(round(aspect_ratio * im_w)) + im_ar = cv2.resize(im, dsize=(im_ar_w, im_h)) + return im_ar + + +def aspect_ratio_abs(im, aspect_ratio): + """Performs absolute aspect ratio transformation.""" + im_h, im_w = im.shape[:2] + im_area = im_h * im_w + + im_ar_w = np.sqrt(im_area * aspect_ratio) + im_ar_h = np.sqrt(im_area / aspect_ratio) + assert np.isclose(im_ar_w / im_ar_h, aspect_ratio) + + im_ar = cv2.resize(im, dsize=(int(im_ar_w), int(im_ar_h))) + return im_ar diff --git a/detectron/utils/io.py b/detectron/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..4501b0a3f10ccf92a4e31d8fea1ac5b1bb4c680c --- /dev/null +++ b/detectron/utils/io.py @@ -0,0 +1,192 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""IO utilities.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import errno +import hashlib +import logging +import os +import re +import six +import sys +from six.moves import cPickle as pickle +from six.moves import urllib +from uuid import uuid4 + +logger = logging.getLogger(__name__) + +_DETECTRON_S3_BASE_URL = 'https://dl.fbaipublicfiles.com/detectron' + + +def save_object(obj, file_name, pickle_format=2): + """Save a Python object by pickling it. + +Unless specifically overridden, we want to save it in Pickle format=2 since this +will allow other Python2 executables to load the resulting Pickle. When we want +to completely remove Python2 backward-compatibility, we can bump it up to 3. We +should never use pickle.HIGHEST_PROTOCOL as far as possible if the resulting +file is manifested or used, external to the system. + """ + file_name = os.path.abspath(file_name) + # Avoid filesystem race conditions (particularly on network filesystems) + # by saving to a random tmp file on the same filesystem, and then + # atomically rename to the target filename. + tmp_file_name = file_name + ".tmp." + uuid4().hex + try: + with open(tmp_file_name, 'wb') as f: + pickle.dump(obj, f, pickle_format) + f.flush() # make sure it's written to disk + os.fsync(f.fileno()) + os.rename(tmp_file_name, file_name) + finally: + # Clean up the temp file on failure. Rather than using os.path.exists(), + # which can be unreliable on network filesystems, attempt to delete and + # ignore os errors. 
+ try: + os.remove(tmp_file_name) + except EnvironmentError as e: # parent class of IOError, OSError + if getattr(e, 'errno', None) != errno.ENOENT: # We expect ENOENT + logger.info("Could not delete temp file %r", + tmp_file_name, exc_info=True) + # pass through since we don't want the job to crash + + +def load_object(file_name): + with open(file_name, 'rb') as f: + # The default encoding used while unpickling is 7-bit (ASCII.) However, + # the blobs are arbitrary 8-bit bytes which don't agree. The absolute + # correct way to do this is to use `encoding="bytes"` and then interpret + # the blob names either as ASCII, or better, as unicode utf-8. A + # reasonable fix, however, is to treat it the encoding as 8-bit latin1 + # (which agrees with the first 256 characters of Unicode anyway.) + if six.PY2: + return pickle.load(f) + else: + return pickle.load(f, encoding='latin1') + + +def cache_url(url_or_file, cache_dir): + """Download the file specified by the URL to the cache_dir and return the + path to the cached file. If the argument is not a URL, simply return it as + is. 
+ """ + is_url = re.match( + r'^(?:http)s?://', url_or_file, re.IGNORECASE + ) is not None + + if not is_url: + return url_or_file + + url = url_or_file + assert url.startswith(_DETECTRON_S3_BASE_URL), \ + ('Detectron only automatically caches URLs in the Detectron S3 ' + 'bucket: {}').format(_DETECTRON_S3_BASE_URL) + + cache_file_path = url.replace(_DETECTRON_S3_BASE_URL, cache_dir) + if os.path.exists(cache_file_path): + assert_cache_file_is_ok(url, cache_file_path) + return cache_file_path + + cache_file_dir = os.path.dirname(cache_file_path) + if not os.path.exists(cache_file_dir): + os.makedirs(cache_file_dir) + + logger.info('Downloading remote file {} to {}'.format(url, cache_file_path)) + download_url(url, cache_file_path) + assert_cache_file_is_ok(url, cache_file_path) + return cache_file_path + + +def assert_cache_file_is_ok(url, file_path): + """Check that cache file has the correct hash.""" + # File is already in the cache, verify that the md5sum matches and + # return local path + cache_file_md5sum = _get_file_md5sum(file_path) + ref_md5sum = _get_reference_md5sum(url) + assert cache_file_md5sum == ref_md5sum, \ + ('Target URL {} appears to be downloaded to the local cache file ' + '{}, but the md5 hash of the local file does not match the ' + 'reference (actual: {} vs. expected: {}). You may wish to delete ' + 'the cached file and try again to trigger automatic ' + 'download.').format(url, file_path, cache_file_md5sum, ref_md5sum) + + +def _progress_bar(count, total): + """Report download progress. + Credit: + https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113 + """ + bar_len = 60 + filled_len = int(round(bar_len * count / float(total))) + + percents = round(100.0 * count / float(total), 1) + bar = '=' * filled_len + '-' * (bar_len - filled_len) + + sys.stdout.write( + ' [{}] {}% of {:.1f}MB file \r'. 
+ format(bar, percents, total / 1024 / 1024) + ) + sys.stdout.flush() + if count >= total: + sys.stdout.write('\n') + + +def download_url( + url, dst_file_path, chunk_size=8192, progress_hook=_progress_bar +): + """Download url and write it to dst_file_path. + Credit: + https://stackoverflow.com/questions/2028517/python-urllib2-progress-hook + """ + response = urllib.request.urlopen(url) + if six.PY2: + total_size = response.info().getheader('Content-Length').strip() + else: + total_size = response.info().get('Content-Length').strip() + total_size = int(total_size) + bytes_so_far = 0 + + with open(dst_file_path, 'wb') as f: + while 1: + chunk = response.read(chunk_size) + bytes_so_far += len(chunk) + if not chunk: + break + if progress_hook: + progress_hook(bytes_so_far, total_size) + f.write(chunk) + + return bytes_so_far + + +def _get_file_md5sum(file_name): + """Compute the md5 hash of a file.""" + hash_obj = hashlib.md5() + with open(file_name, 'rb') as f: + hash_obj.update(f.read()) + return hash_obj.hexdigest().encode('utf-8') + + +def _get_reference_md5sum(url): + """By convention the md5 hash for url is stored in url + '.md5sum'.""" + url_md5sum = url + '.md5sum' + md5sum = urllib.request.urlopen(url_md5sum).read().strip() + return md5sum diff --git a/detectron/utils/keypoints.py b/detectron/utils/keypoints.py new file mode 100644 index 0000000000000000000000000000000000000000..b305cea2d6d4e527da226645d44cf34328f02cc6 --- /dev/null +++ b/detectron/utils/keypoints.py @@ -0,0 +1,266 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Keypoint utilities (somewhat specific to COCO keypoints).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np + +from detectron.core.config import cfg +import detectron.utils.blob as blob_utils + + +def get_keypoints(): + """Get the COCO keypoints and their left/right flip coorespondence map.""" + # Keypoints are not available in the COCO json for the test split, so we + # provide them here. + keypoints = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle' + ] + keypoint_flip_map = { + 'left_eye': 'right_eye', + 'left_ear': 'right_ear', + 'left_shoulder': 'right_shoulder', + 'left_elbow': 'right_elbow', + 'left_wrist': 'right_wrist', + 'left_hip': 'right_hip', + 'left_knee': 'right_knee', + 'left_ankle': 'right_ankle' + } + return keypoints, keypoint_flip_map + + +def get_person_class_index(): + """Index of the person class in COCO.""" + return 1 + + +def flip_keypoints(keypoints, keypoint_flip_map, keypoint_coords, width): + """Left/right flip keypoint_coords. keypoints and keypoint_flip_map are + accessible from get_keypoints(). 
+ """ + flipped_kps = keypoint_coords.copy() + for lkp, rkp in keypoint_flip_map.items(): + lid = keypoints.index(lkp) + rid = keypoints.index(rkp) + flipped_kps[:, :, lid] = keypoint_coords[:, :, rid] + flipped_kps[:, :, rid] = keypoint_coords[:, :, lid] + + # Flip x coordinates + flipped_kps[:, 0, :] = width - flipped_kps[:, 0, :] - 1 + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = np.where(flipped_kps[:, 2, :] == 0) + flipped_kps[inds[0], 0, inds[1]] = 0 + return flipped_kps + + +def flip_heatmaps(heatmaps): + """Flip heatmaps horizontally.""" + keypoints, flip_map = get_keypoints() + heatmaps_flipped = heatmaps.copy() + for lkp, rkp in flip_map.items(): + lid = keypoints.index(lkp) + rid = keypoints.index(rkp) + heatmaps_flipped[:, rid, :, :] = heatmaps[:, lid, :, :] + heatmaps_flipped[:, lid, :, :] = heatmaps[:, rid, :, :] + heatmaps_flipped = heatmaps_flipped[:, :, :, ::-1] + return heatmaps_flipped + + +def heatmaps_to_keypoints(maps, rois): + """Extract predicted keypoint locations from heatmaps. Output has shape + (#rois, 4, #keypoints) with the 4 rows corresponding to (x, y, logit, prob) + for each keypoint. + """ + # This function converts a discrete image coordinate in a HEATMAP_SIZE x + # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain + # consistency with keypoints_to_heatmap_labels by using the conversion from + # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a + # continuous coordinate. 
+ offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = rois[:, 2] - rois[:, 0] + heights = rois[:, 3] - rois[:, 1] + widths = np.maximum(widths, 1) + heights = np.maximum(heights, 1) + widths_ceil = np.ceil(widths) + heights_ceil = np.ceil(heights) + + # NCHW to NHWC for use with OpenCV + maps = np.transpose(maps, [0, 2, 3, 1]) + min_size = cfg.KRCNN.INFERENCE_MIN_SIZE + xy_preds = np.zeros( + (len(rois), 4, cfg.KRCNN.NUM_KEYPOINTS), dtype=np.float32) + for i in range(len(rois)): + if min_size > 0: + roi_map_width = int(np.maximum(widths_ceil[i], min_size)) + roi_map_height = int(np.maximum(heights_ceil[i], min_size)) + else: + roi_map_width = widths_ceil[i] + roi_map_height = heights_ceil[i] + width_correction = widths[i] / roi_map_width + height_correction = heights[i] / roi_map_height + roi_map = cv2.resize( + maps[i], (roi_map_width, roi_map_height), + interpolation=cv2.INTER_CUBIC) + # Bring back to CHW + roi_map = np.transpose(roi_map, [2, 0, 1]) + roi_map_probs = scores_to_probs(roi_map.copy()) + w = roi_map.shape[2] + for k in range(cfg.KRCNN.NUM_KEYPOINTS): + pos = roi_map[k, :, :].argmax() + x_int = pos % w + y_int = (pos - x_int) // w + assert (roi_map_probs[k, y_int, x_int] == + roi_map_probs[k, :, :].max()) + x = (x_int + 0.5) * width_correction + y = (y_int + 0.5) * height_correction + xy_preds[i, 0, k] = x + offset_x[i] + xy_preds[i, 1, k] = y + offset_y[i] + xy_preds[i, 2, k] = roi_map[k, y_int, x_int] + xy_preds[i, 3, k] = roi_map_probs[k, y_int, x_int] + + return xy_preds + + +def keypoints_to_heatmap_labels(keypoints, rois): + """Encode keypoint location in the target heatmap for use in + SoftmaxWithLoss. + """ + # Maps keypoints from the half-open interval [x1, x2) on continuous image + # coordinates to the closed interval [0, HEATMAP_SIZE - 1] on discrete image + # coordinates. 
We use the continuous <-> discrete conversion from Heckbert + # 1990 ("What is the coordinate of a pixel?"): d = floor(c) and c = d + 0.5, + # where d is a discrete coordinate and c is a continuous coordinate. + assert keypoints.shape[2] == cfg.KRCNN.NUM_KEYPOINTS + + shape = (len(rois), cfg.KRCNN.NUM_KEYPOINTS) + heatmaps = blob_utils.zeros(shape) + weights = blob_utils.zeros(shape) + + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = cfg.KRCNN.HEATMAP_SIZE / (rois[:, 2] - rois[:, 0]) + scale_y = cfg.KRCNN.HEATMAP_SIZE / (rois[:, 3] - rois[:, 1]) + + for kp in range(keypoints.shape[2]): + vis = keypoints[:, 2, kp] > 0 + x = keypoints[:, 0, kp].astype(np.float32) + y = keypoints[:, 1, kp].astype(np.float32) + # Since we use floor below, if a keypoint is exactly on the roi's right + # or bottom boundary, we shift it in by eps (conceptually) to keep it in + # the ground truth heatmap. + x_boundary_inds = np.where(x == rois[:, 2])[0] + y_boundary_inds = np.where(y == rois[:, 3])[0] + x = (x - offset_x) * scale_x + x = np.floor(x) + if len(x_boundary_inds) > 0: + x[x_boundary_inds] = cfg.KRCNN.HEATMAP_SIZE - 1 + + y = (y - offset_y) * scale_y + y = np.floor(y) + if len(y_boundary_inds) > 0: + y[y_boundary_inds] = cfg.KRCNN.HEATMAP_SIZE - 1 + + valid_loc = np.logical_and( + np.logical_and(x >= 0, y >= 0), + np.logical_and( + x < cfg.KRCNN.HEATMAP_SIZE, y < cfg.KRCNN.HEATMAP_SIZE)) + + valid = np.logical_and(valid_loc, vis) + valid = valid.astype(np.int32) + + lin_ind = y * cfg.KRCNN.HEATMAP_SIZE + x + heatmaps[:, kp] = lin_ind * valid + weights[:, kp] = valid + + return heatmaps, weights + + +def scores_to_probs(scores): + """Transforms CxHxW of scores to probabilities spatially.""" + channels = scores.shape[0] + for c in range(channels): + temp = scores[c, :, :] + max_score = temp.max() + temp = np.exp(temp - max_score) / np.sum(np.exp(temp - max_score)) + scores[c, :, :] = temp + return scores + + +def nms_oks(kp_predictions, rois, thresh): + """Nms based on 
kp predictions.""" + scores = np.mean(kp_predictions[:, 2, :], axis=1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = compute_oks( + kp_predictions[i], rois[i], kp_predictions[order[1:]], + rois[order[1:]]) + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +def compute_oks(src_keypoints, src_roi, dst_keypoints, dst_roi): + """Compute OKS for predicted keypoints wrt gt_keypoints. + src_keypoints: 4xK + src_roi: 4x1 + dst_keypoints: Nx4xK + dst_roi: Nx4 + """ + + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, + .87, .89, .89]) / 10.0 + vars = (sigmas * 2)**2 + + # area + src_area = (src_roi[2] - src_roi[0] + 1) * (src_roi[3] - src_roi[1] + 1) + + # measure the per-keypoint distance if keypoints visible + dx = dst_keypoints[:, 0, :] - src_keypoints[0, :] + dy = dst_keypoints[:, 1, :] - src_keypoints[1, :] + + e = (dx**2 + dy**2) / vars / (src_area + np.spacing(1)) / 2 + e = np.sum(np.exp(-e), axis=1) / e.shape[1] + + return e diff --git a/detectron/utils/logging.py b/detectron/utils/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..eba6dfb84c30cb3c3c01850d7efc4af72f30447c --- /dev/null +++ b/detectron/utils/logging.py @@ -0,0 +1,83 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Utilities for logging.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import deque +from email.mime.text import MIMEText +import json +import logging +import numpy as np +import smtplib +import sys + + +def log_json_stats(stats, sort_keys=True): + # hack to control precision of top-level floats + stats = { + k: '{:.6f}'.format(v) if isinstance(v, float) else v + for k, v in stats.items() + } + print('json_stats: {:s}'.format(json.dumps(stats, sort_keys=sort_keys))) + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size): + self.deque = deque(maxlen=window_size) + self.series = [] + self.total = 0.0 + self.count = 0 + + def AddValue(self, value): + self.deque.append(value) + self.series.append(value) + self.count += 1 + self.total += value + + def GetMedianValue(self): + return np.median(self.deque) + + def GetAverageValue(self): + return np.mean(self.deque) + + def GetGlobalAverageValue(self): + return self.total / self.count + + +def send_email(subject, body, to): + s = smtplib.SMTP('localhost') + mime = MIMEText(body) + mime['Subject'] = subject + mime['To'] = to + s.sendmail('detectron', to, mime.as_string()) + + +def setup_logging(name): + FORMAT = '%(levelname)s %(filename)s:%(lineno)4d: %(message)s' + # Manually clear root loggers to prevent any module that may have called + # logging.basicConfig() from blocking our logging setup + logging.root.handlers = [] + logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) + logger = logging.getLogger(name) + return logger diff --git a/detectron/utils/lr_policy.py b/detectron/utils/lr_policy.py new file mode 100644 index 
0000000000000000000000000000000000000000..92391b18e179b915c2b8e72b165cf422c1235c04 --- /dev/null +++ b/detectron/utils/lr_policy.py @@ -0,0 +1,131 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Learning rate policies.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +from detectron.core.config import cfg + + +def get_lr_at_iter(it): + """Get the learning rate at iteration it according to the cfg.SOLVER + settings. + """ + lr = get_lr_func()(it) + if it < cfg.SOLVER.WARM_UP_ITERS: + method = cfg.SOLVER.WARM_UP_METHOD + if method == 'constant': + warmup_factor = cfg.SOLVER.WARM_UP_FACTOR + elif method == 'linear': + alpha = it / cfg.SOLVER.WARM_UP_ITERS + warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha + else: + raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) + lr *= warmup_factor + return np.float32(lr) + + +# ---------------------------------------------------------------------------- # +# Learning rate policy functions +# ---------------------------------------------------------------------------- # + +def lr_func_steps_with_lrs(cur_iter): + """For cfg.SOLVER.LR_POLICY = 'steps_with_lrs' + + Change the learning rate to specified values at specified iterations. 
+ + Example: + cfg.SOLVER.MAX_ITER: 90 + cfg.SOLVER.STEPS: [0, 60, 80] + cfg.SOLVER.LRS: [0.02, 0.002, 0.0002] + for cur_iter in [0, 59] use 0.02 + in [60, 79] use 0.002 + in [80, inf] use 0.0002 + """ + ind = get_step_index(cur_iter) + return cfg.SOLVER.LRS[ind] + + +def lr_func_steps_with_decay(cur_iter): + """For cfg.SOLVER.LR_POLICY = 'steps_with_decay' + + Change the learning rate specified iterations based on the formula + lr = base_lr * gamma ** lr_step_count. + + Example: + cfg.SOLVER.MAX_ITER: 90 + cfg.SOLVER.STEPS: [0, 60, 80] + cfg.SOLVER.BASE_LR: 0.02 + cfg.SOLVER.GAMMA: 0.1 + for cur_iter in [0, 59] use 0.02 = 0.02 * 0.1 ** 0 + in [60, 79] use 0.002 = 0.02 * 0.1 ** 1 + in [80, inf] use 0.0002 = 0.02 * 0.1 ** 2 + """ + ind = get_step_index(cur_iter) + return cfg.SOLVER.BASE_LR * cfg.SOLVER.GAMMA ** ind + + +def lr_func_step(cur_iter): + """For cfg.SOLVER.LR_POLICY = 'step' + """ + return ( + cfg.SOLVER.BASE_LR * + cfg.SOLVER.GAMMA ** (cur_iter // cfg.SOLVER.STEP_SIZE)) + + +def lr_func_cosine_decay(cur_iter): + """For cfg.SOLVER.LR_POLICY = 'cosine_decay' + """ + iter_frac = float(cur_iter) / cfg.SOLVER.MAX_ITER + cos_frac = 0.5 * (np.cos(np.pi * iter_frac) + 1) + return cfg.SOLVER.BASE_LR * cos_frac + + +def lr_func_exp_decay(cur_iter): + """For cfg.SOLVER.LR_POLICY = 'exp_decay' + """ + # GAMMA is final/initial learning rate ratio + iter_frac = float(cur_iter) / cfg.SOLVER.MAX_ITER + exp_frac = np.exp(iter_frac * np.log(cfg.SOLVER.GAMMA)) + return cfg.SOLVER.BASE_LR * exp_frac + + +# ---------------------------------------------------------------------------- # +# Helpers +# ---------------------------------------------------------------------------- # + +def get_step_index(cur_iter): + """Given an iteration, find which learning rate step we're at.""" + assert cfg.SOLVER.STEPS[0] == 0, 'The first step should always start at 0.' 
+ steps = cfg.SOLVER.STEPS + [cfg.SOLVER.MAX_ITER] + for ind, step in enumerate(steps): # NoQA + if cur_iter < step: + break + return ind - 1 + + +def get_lr_func(): + policy = 'lr_func_' + cfg.SOLVER.LR_POLICY + if policy not in globals(): + raise NotImplementedError( + 'Unknown LR policy: {}'.format(cfg.SOLVER.LR_POLICY)) + else: + return globals()[policy] diff --git a/detectron/utils/model_convert_utils.py b/detectron/utils/model_convert_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7c2b2738dcf9161fa943f45ec195b93cb380c92c --- /dev/null +++ b/detectron/utils/model_convert_utils.py @@ -0,0 +1,406 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +'''Helper functions for model conversion to pb''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from functools import wraps +import copy +import numpy as np + +from caffe2.python import core, workspace +from caffe2.proto import caffe2_pb2 + + +class OpFilter(object): + def __init__(self, **kwargs): + self.type = None + self.type_in = None + self.inputs = None + self.outputs = None + self.input_has = None + self.output_has = None + self.cond = None + self.reverse = False + + assert all([x in self.__dict__ for x in kwargs]) + self.__dict__.update(kwargs) + + def check(self, op): + ret = self.reverse + if self.type and op.type != self.type: + return ret + if self.type_in and op.type not in self.type_in: + return ret + if self.inputs and set(op.input) != set(self.inputs): + return ret + if self.outputs and set(op.output) != set(self.outputs): + return ret + if self.input_has and self.input_has not in op.input: + return ret + if self.output_has and self.output_has not in op.output: + return ret + if self.cond is not None and not self.cond: + return ret + return not ret + + +def filter_op(op, **kwargs): + ''' Returns true if passed all checks ''' + return OpFilter(**kwargs).check(op) + + +def op_filter(**filter_args): + ''' Returns None if no condition is satisfied ''' + def actual_decorator(f): + @wraps(f) + def wrapper(op, **params): + if not filter_op(op, **filter_args): + return None + return f(op, **params) + return wrapper + return actual_decorator + + +def op_func_chain(convert_func_list): + ''' Run funcs one by one until func return is not None ''' + assert isinstance(convert_func_list, list) + + def _chain(op): + for x in convert_func_list: + ret = x(op) + if ret is not None: + return ret + return None + + return _chain + + +def convert_op_in_ops(ops_ref, func_or_list): 
+ func = func_or_list + if isinstance(func_or_list, list): + func = op_func_chain(func_or_list) + ops = [op for op in ops_ref] + converted_ops = [] + for op in ops: + new_ops = func(op) + if new_ops is not None and not isinstance(new_ops, list): + new_ops = [new_ops] + converted_ops.extend(new_ops if new_ops is not None else [op]) + del ops_ref[:] + # ops_ref maybe of type RepeatedCompositeFieldContainer + # which does not have append() + ops_ref.extend(converted_ops) + + +def convert_op_in_proto(proto, func_or_list): + convert_op_in_ops(proto.op, func_or_list) + + +def get_op_arg(op, arg_name): + for x in op.arg: + if x.name == arg_name: + return x + return None + + +def get_op_arg_valf(op, arg_name, default_val): + arg = get_op_arg(op, arg_name) + return arg.f if arg is not None else default_val + + +def update_mobile_engines(net): + for op in net.op: + if op.type == "Conv": + op.engine = "NNPACK" + if op.type == "ConvTranspose": + op.engine = "BLOCK" + + +def pairwise(iterable): + "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
+ from itertools import tee + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + +def blob_uses(net, blob): + u = [] + for i, op in enumerate(net.op): + if blob in op.input or blob in op.control_input: + u.append(i) + return u + + +def fuse_first_affine(net, params, removed_tensors): + net = copy.deepcopy(net) + params = copy.deepcopy(params) + + for ((i, current), (j, next_)) in pairwise(enumerate(net.op)): + if next_.input[0] != current.output[0]: + continue + + if current.type not in ("Conv", "ConvTranspose") \ + or next_.type != "AffineChannel": + continue + if current.output[0] != next_.output[0] and \ + len(blob_uses(net, current.output[0])) != 1: + # Can't fuse if more than one user unless AffineChannel is inplace + continue + + # else, can fuse + conv = current + affine = next_ + fused_conv = copy.deepcopy(conv) + fused_conv.output[0] = affine.output[0] + conv_weight = params[conv.input[1]] + conv_has_bias = len(conv.input) > 2 + conv_bias = params[conv.input[2]] if conv_has_bias else 0 + + A = params[affine.input[1]] + B = params[affine.input[2]] + + # Thus, can just have the affine transform + # X * A + B + # where + # A = bn_scale * 1.0 / (sqrt(running_var + eps)) + # B = (bias - running_mean * (1.0 / sqrt(running_var + eps)) + # * bn_scale) + + # This identify should hold if we have correctly fused + # np.testing.assert_array_equal( + # params[conv.output[0]] * A + B, + # params[bn.output[0]]) + + # Now, we have that the computation made is the following: + # ((X `conv` W) + b) * A + B + # Then, we can simply fuse this as follows: + # (X `conv` (W * A)) + b * A + B + # which is simply + # (X `conv` Q) + C + # where + + # Q = W * A + # C = b * A + B + + # For ConvTranspose, from the view of convolutions as a + # Toepeliz multiplication, we have W_ = W^T, so the weights + # are laid out as (R, S, K, K) (vs (S, R, K, K) for a Conv), + # so the weights broadcast slightly differently. 
Remember, our + # BN scale 'B' is of size (S,) + + A_ = A.reshape(-1, 1, 1, 1) if conv.type == "Conv" else \ + A.reshape(1, -1, 1, 1) + + C = conv_bias * A + B + Q = conv_weight * A_ + + assert params[conv.input[1]].shape == Q.shape + + params[conv.input[1]] = Q + if conv_has_bias: + assert params[conv.input[2]].shape == C.shape + params[conv.input[2]] = C + else: + # make af_bias to be bias of the conv layer + fused_conv.input.append(affine.input[2]) + params[affine.input[2]] = B + + new_ops = net.op[:i] + [fused_conv] + net.op[j + 1:] + del net.op[:] + if conv_has_bias: + del params[affine.input[2]] + removed_tensors.append(affine.input[2]) + removed_tensors.append(affine.input[1]) + del params[affine.input[1]] + net.op.extend(new_ops) + break + return net, params, removed_tensors + + +def fuse_affine(net, params, ignore_failure): + # Run until we hit a fixed point + removed_tensors = [] + while True: + (next_net, next_params, removed_tensors) = \ + fuse_first_affine(net, params, removed_tensors) + if len(next_net.op) == len(net.op): + if ( + any(op.type == "AffineChannel" for op in next_net.op) and + not ignore_failure + ): + raise Exception( + "Model contains AffineChannel op after fusion: %s", next_net) + return (next_net, next_params, removed_tensors) + net, params, removed_tensors = (next_net, next_params, removed_tensors) + + +def fuse_net(fuse_func, net, blobs, ignore_failure=False): + is_core_net = isinstance(net, core.Net) + if is_core_net: + net = net.Proto() + + net, params, removed_tensors = fuse_func(net, blobs, ignore_failure) + for rt in removed_tensors: + net.external_input.remove(rt) + + if is_core_net: + net = core.Net(net) + + return net, params + + +def fuse_net_affine(net, blobs): + return fuse_net(fuse_affine, net, blobs) + + +def add_tensor(net, name, blob): + ''' Create an operator to store the tensor 'blob', + run the operator to put the blob to workspace. + uint8 is stored as an array of string with one element. 
+ ''' + kTypeNameMapper = { + np.dtype('float32'): "GivenTensorFill", + np.dtype('int32'): "GivenTensorIntFill", + np.dtype('int64'): "GivenTensorInt64Fill", + np.dtype('uint8'): "GivenTensorStringFill", + } + + shape = blob.shape + values = blob + # pass array of uint8 as a string to save storage + # storing uint8_t has a large overhead for now + if blob.dtype == np.dtype('uint8'): + shape = [1] + values = [str(blob.data)] + + op = core.CreateOperator( + kTypeNameMapper[blob.dtype], + [], [name], + shape=shape, + values=values, + # arg=[ + # putils.MakeArgument("shape", shape), + # putils.MakeArgument("values", values), + # ] + ) + net.op.extend([op]) + + +def gen_init_net_from_blobs(blobs, blobs_to_use=None, excluded_blobs=None): + ''' Generate an initialization net based on a blob dict ''' + ret = caffe2_pb2.NetDef() + if blobs_to_use is None: + blobs_to_use = {x for x in blobs} + else: + blobs_to_use = copy.deepcopy(blobs_to_use) + if excluded_blobs is not None: + blobs_to_use = [x for x in blobs_to_use if x not in excluded_blobs] + for name in blobs_to_use: + blob = blobs[name] + if isinstance(blob, str): + print('Blob {} with type {} is not supported in generating init net,' + ' skipped.'.format(name, type(blob))) + continue + add_tensor(ret, name, blob) + + return ret + + +def get_ws_blobs(blob_names=None): + ''' Get blobs in 'blob_names' in the default workspace, + get all blobs if blob_names is None ''' + blobs = {} + if blob_names is None: + blob_names = workspace.Blobs() + blobs = {x: workspace.FetchBlob(x) for x in blob_names} + + return blobs + + +def get_device_option_cpu(): + device_option = core.DeviceOption(caffe2_pb2.CPU) + return device_option + + +def get_device_option_cuda(gpu_id=0): + device_option = caffe2_pb2.DeviceOption() + device_option.device_type = caffe2_pb2.CUDA + device_option.device_id = gpu_id + return device_option + + +def create_input_blobs_for_net(net_def): + for op in net_def.op: + for blob_in in op.input: + if not 
workspace.HasBlob(blob_in): + workspace.CreateBlob(blob_in) + + +def compare_model(model1_func, model2_func, test_image, check_blobs): + ''' model_func(test_image, check_blobs) + ''' + cb1, cb2 = check_blobs, check_blobs + if isinstance(check_blobs, dict): + cb1 = check_blobs.keys() + cb2 = check_blobs.values() + print('Running the first model...') + res1 = model1_func(test_image, check_blobs) + print('Running the second model...') + res2 = model2_func(test_image, check_blobs) + for idx in range(len(cb1)): + print('Checking {} -> {}...'.format(cb1[idx], cb2[idx])) + n1, n2 = cb1[idx], cb2[idx] + r1 = res1[n1] if n1 in res1 else None + r2 = res2[n2] if n2 in res2 else None + assert r1 is not None or r2 is None, \ + "Blob {} in model1 is None".format(n1) + assert r2 is not None or r1 is None, \ + "Blob {} in model2 is None".format(n2) + assert r1.shape == r2.shape, \ + "Blob {} and {} shape mismatched: {} vs {}".format( + n1, n2, r1.shape, r2.shape) + + np.testing.assert_array_almost_equal( + r1, r2, decimal=3, + err_msg='{} and {} not matched. Max diff: {}'.format( + n1, n2, np.amax(np.absolute(r1 - r2)))) + + return True + + +# graph_name could not contain word 'graph' +def save_graph(net, file_name, graph_name="net", op_only=True): + from caffe2.python import net_drawer + graph = None + ops = net.op + if not op_only: + graph = net_drawer.GetPydotGraph( + ops, graph_name, + rankdir="TB") + else: + graph = net_drawer.GetPydotGraphMinimal( + ops, graph_name, + rankdir="TB", minimal_dependency=True) + + try: + graph.write_png(file_name) + except Exception as e: + print('Error when writing graph to image {}'.format(e)) diff --git a/detectron/utils/net.py b/detectron/utils/net.py new file mode 100644 index 0000000000000000000000000000000000000000..f98f02f4028e988f00d5c9fcc64fa0c1ecc292b7 --- /dev/null +++ b/detectron/utils/net.py @@ -0,0 +1,298 @@ +# Copyright (c) 2017-present, Facebook, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Helper functions for working with Caffe2 networks (i.e., operator graphs).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from collections import OrderedDict +import logging +import numpy as np +import os +import pprint + +from caffe2.python import core +from caffe2.python import workspace + +from detectron.core.config import cfg +from detectron.core.config import load_cfg +from detectron.utils.io import load_object +from detectron.utils.io import save_object +import detectron.utils.c2 as c2_utils +import detectron.utils.env as envu + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def initialize_from_weights_file(model, weights_file, broadcast=True): + """Initialize a model from weights stored in a pickled dictionary. If + multiple GPUs are used, the loaded weights are synchronized on all GPUs, + unless 'broadcast' is False. + """ + initialize_gpu_from_weights_file(model, weights_file, gpu_id=0) + if broadcast: + broadcast_parameters(model) + + +def initialize_gpu_from_weights_file(model, weights_file, gpu_id=0): + """Initialize a network with ops on a specific GPU. 
+ + If you use CUDA_VISIBLE_DEVICES to target specific GPUs, Caffe2 will + automatically map logical GPU ids (starting from 0) to the physical GPUs + specified in CUDA_VISIBLE_DEVICES. + """ + logger.info('Loading weights from: {}'.format(weights_file)) + ws_blobs = workspace.Blobs() + src_blobs = load_object(weights_file) + + if 'cfg' in src_blobs: + saved_cfg = load_cfg(src_blobs['cfg']) + configure_bbox_reg_weights(model, saved_cfg) + if 'blobs' in src_blobs: + # Backwards compat--dictionary used to be only blobs, now they are + # stored under the 'blobs' key + src_blobs = src_blobs['blobs'] + # Initialize weights on GPU gpu_id only + unscoped_param_names = OrderedDict() # Print these out in model order + for blob in model.params: + unscoped_param_names[c2_utils.UnscopeName(str(blob))] = True + with c2_utils.NamedCudaScope(gpu_id): + for unscoped_param_name in unscoped_param_names.keys(): + if (unscoped_param_name.find(']_') >= 0 and + unscoped_param_name not in src_blobs): + # Special case for sharing initialization from a pretrained + # model: + # If a blob named '_[xyz]_foo' is in model.params and not in + # the initialization blob dictionary, then load source blob + # 'foo' into destination blob '_[xyz]_foo' + src_name = unscoped_param_name[ + unscoped_param_name.find(']_') + 2:] + else: + src_name = unscoped_param_name + if src_name not in src_blobs: + logger.info('{:s} not found'.format(src_name)) + continue + dst_name = core.ScopedName(unscoped_param_name) + has_momentum = src_name + '_momentum' in src_blobs + has_momentum_str = ' [+ momentum]' if has_momentum else '' + logger.info( + '{:s}{:} loaded from weights file into {:s}: {}'.format( + src_name, has_momentum_str, dst_name, src_blobs[src_name] + .shape + ) + ) + if dst_name in ws_blobs: + # If the blob is already in the workspace, make sure that it + # matches the shape of the loaded blob + ws_blob = workspace.FetchBlob(dst_name) + assert ws_blob.shape == src_blobs[src_name].shape, \ + ('Workspace 
blob {} with shape {} does not match ' + 'weights file shape {}').format( + src_name, + ws_blob.shape, + src_blobs[src_name].shape) + workspace.FeedBlob( + dst_name, + src_blobs[src_name].astype(np.float32, copy=False)) + if has_momentum: + workspace.FeedBlob( + dst_name + '_momentum', + src_blobs[src_name + '_momentum'].astype( + np.float32, copy=False)) + + # We preserve blobs that are in the weights file but not used by the current + # model. We load these into CPU memory under the '__preserve__/' namescope. + # These blobs will be stored when saving a model to a weights file. This + # feature allows for alternating optimization of Faster R-CNN in which blobs + # unused by one step can still be preserved forward and used to initialize + # another step. + for src_name in src_blobs.keys(): + if (src_name not in unscoped_param_names and + not src_name.endswith('_momentum') and + src_blobs[src_name] is not None): + with c2_utils.CpuScope(): + workspace.FeedBlob( + '__preserve__/{:s}'.format(src_name), src_blobs[src_name]) + logger.info( + '{:s} preserved in workspace (unused)'.format(src_name)) + + +def save_model_to_weights_file(weights_file, model): + """Stash model weights in a dictionary and pickle them to a file. We map + GPU device scoped names to unscoped names (e.g., 'gpu_0/conv1_w' -> + 'conv1_w'). 
+ """ + logger.info( + 'Saving parameters and momentum to {}'.format( + os.path.abspath(weights_file))) + blobs = {} + # Save all parameters + for param in model.params: + scoped_name = str(param) + unscoped_name = c2_utils.UnscopeName(scoped_name) + if unscoped_name not in blobs: + logger.debug(' {:s} -> {:s}'.format(scoped_name, unscoped_name)) + blobs[unscoped_name] = workspace.FetchBlob(scoped_name) + # Save momentum + for param in model.TrainableParams(): + scoped_name = str(param) + '_momentum' + unscoped_name = c2_utils.UnscopeName(scoped_name) + if unscoped_name not in blobs: + logger.debug(' {:s} -> {:s}'.format(scoped_name, unscoped_name)) + blobs[unscoped_name] = workspace.FetchBlob(scoped_name) + # Save preserved blobs + for scoped_name in workspace.Blobs(): + if scoped_name.startswith('__preserve__/'): + unscoped_name = c2_utils.UnscopeName(scoped_name) + if unscoped_name not in blobs: + logger.debug( + ' {:s} -> {:s} (preserved)'.format( + scoped_name, unscoped_name)) + blobs[unscoped_name] = workspace.FetchBlob(scoped_name) + cfg_yaml = envu.yaml_dump(cfg) + save_object(dict(blobs=blobs, cfg=cfg_yaml), weights_file) + + +def broadcast_parameters(model): + """Copy parameter blobs from GPU 0 over the corresponding parameter blobs + on GPUs 1 through cfg.NUM_GPUS - 1. + """ + if cfg.NUM_GPUS == 1: + # no-op if only running on a single GPU + return + + def _do_broadcast(all_blobs): + assert len(all_blobs) % cfg.NUM_GPUS == 0, \ + ('Unexpected value for NUM_GPUS. 
Make sure you are not ' + 'running single-GPU inference with NUM_GPUS > 1.') + blobs_per_gpu = int(len(all_blobs) / cfg.NUM_GPUS) + for i in range(blobs_per_gpu): + blobs = [p for p in all_blobs[i::blobs_per_gpu]] + data = workspace.FetchBlob(blobs[0]) + logger.debug('Broadcasting {} to'.format(str(blobs[0]))) + for i, p in enumerate(blobs[1:]): + logger.debug(' |-> {}'.format(str(p))) + with c2_utils.CudaScope(i + 1): + workspace.FeedBlob(p, data) + + _do_broadcast(model.params) + _do_broadcast([b + '_momentum' for b in model.TrainableParams()]) + + +def sum_multi_gpu_blob(blob_name): + """Return the sum of a scalar blob held on multiple GPUs.""" + val = 0 + for i in range(cfg.NUM_GPUS): + val += float(workspace.FetchBlob('gpu_{}/{}'.format(i, blob_name))) + return val + + +def average_multi_gpu_blob(blob_name): + """Return the average of a scalar blob held on multiple GPUs.""" + return sum_multi_gpu_blob(blob_name) / cfg.NUM_GPUS + + +def print_net(model, namescope='gpu_0'): + """Print the model network.""" + logger.info('Printing model: {}'.format(model.net.Name())) + op_list = model.net.Proto().op + for op in op_list: + input_name = op.input + # For simplicity: only print the first output + # Not recommended if there are split layers + output_name = str(op.output[0]) + op_type = op.type + op_name = op.name + + if namescope is None or output_name.startswith(namescope): + # Only print the forward pass network + if output_name.find('grad') >= 0 or output_name.find('__m') >= 0: + continue + + try: + # Under some conditions (e.g., dynamic memory optimization) + # it is possible that the network frees some blobs when they are + # no longer needed. Handle this case... 
+ output_shape = workspace.FetchBlob(output_name).shape + except BaseException: + output_shape = '' + + first_blob = True + op_label = op_type + (op_name if op_name == '' else ':' + op_name) + suffix = ' ------- (op: {})'.format(op_label) + for j in range(len(input_name)): + if input_name[j] in model.params: + continue + input_blob = workspace.FetchBlob(input_name[j]) + if isinstance(input_blob, np.ndarray): + input_shape = input_blob.shape + logger.info('{:28s}: {:20s} => {:28s}: {:20s}{}'.format( + c2_utils.UnscopeName(str(input_name[j])), + '{}'.format(input_shape), + c2_utils.UnscopeName(str(output_name)), + '{}'.format(output_shape), + suffix)) + if first_blob: + first_blob = False + suffix = ' ------|' + logger.info('End of model: {}'.format(model.net.Name())) + + +def configure_bbox_reg_weights(model, saved_cfg): + """Compatibility for old models trained with bounding box regression + mean/std normalization (instead of fixed weights). + """ + if 'MODEL' not in saved_cfg or 'BBOX_REG_WEIGHTS' not in saved_cfg.MODEL: + logger.warning('Model from weights file was trained before config key ' + 'MODEL.BBOX_REG_WEIGHTS was added. Forcing ' + 'MODEL.BBOX_REG_WEIGHTS = (1., 1., 1., 1.) to ensure ' + 'correct **inference** behavior.') + # Generally we don't allow modifying the config, but this is a one-off + # hack to support some very old models + is_immutable = cfg.is_immutable() + cfg.immutable(False) + cfg.MODEL.BBOX_REG_WEIGHTS = (1., 1., 1., 1.) + cfg.immutable(is_immutable) + logger.info('New config:') + logger.info(pprint.pformat(cfg)) + assert not model.train, ( + 'This model was trained with an older version of the code that ' + 'used bounding box regression mean/std normalization. It can no ' + 'longer be used for training. To upgrade it to a trainable model ' + 'please use fb/compat/convert_bbox_reg_normalized_model.py.' 
+ ) + + +def get_group_gn(dim): + """ + get number of groups used by GroupNorm, based on number of channels + """ + dim_per_gp = cfg.GROUP_NORM.DIM_PER_GP + num_groups = cfg.GROUP_NORM.NUM_GROUPS + + assert dim_per_gp == -1 or num_groups == -1, \ + "GroupNorm: can only specify G or C/G." + + if dim_per_gp > 0: + assert dim % dim_per_gp == 0 + group_gn = dim // dim_per_gp + else: + assert dim % num_groups == 0 + group_gn = num_groups + return group_gn diff --git a/detectron/utils/segms.py b/detectron/utils/segms.py new file mode 100644 index 0000000000000000000000000000000000000000..4620a3592a5a7c6e3de4e2b4af05ed90ca14a5ec --- /dev/null +++ b/detectron/utils/segms.py @@ -0,0 +1,279 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Functions for interacting with segmentation masks in the COCO format. 
+ +The following terms are used in this module + mask: a binary mask encoded as a 2D numpy array + segm: a segmentation mask in one of the two COCO formats (polygon or RLE) + polygon: COCO's polygon format + RLE: COCO's run length encoding format +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np + +import pycocotools.mask as mask_util + +# Type used for storing masks in polygon format +_POLY_TYPE = list +# Type used for storing masks in RLE format +_RLE_TYPE = dict + + +def is_poly(segm): + """Determine if segm is a polygon. Valid segm expected (polygon or RLE).""" + assert isinstance(segm, (_POLY_TYPE, _RLE_TYPE)), \ + 'Invalid segm type: {}'.format(type(segm)) + return isinstance(segm, _POLY_TYPE) + + +def flip_segms(segms, height, width): + """Left/right flip each mask in a list of masks.""" + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) - 1 + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + # Magic RLE format handling painfully discovered by looking at the + # COCO API showAnns function. + rle = mask_util.frPyObjects([rle], height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1, :] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + flipped_segms = [] + for segm in segms: + if is_poly(segm): + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + +def polys_to_mask(polygons, height, width): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed inside a height x width image. 
The resulting + mask is therefore of shape (height, width). + """ + rle = mask_util.frPyObjects(polygons, height, width) + mask = np.array(mask_util.decode(rle), dtype=np.float32) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=2) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def mask_to_bbox(mask): + """Compute the tight bounding box of a binary mask.""" + xs = np.where(np.sum(mask, axis=0) > 0)[0] + ys = np.where(np.sum(mask, axis=1) > 0)[0] + + if len(xs) == 0 or len(ys) == 0: + return None + + x0 = xs[0] + x1 = xs[-1] + y0 = ys[0] + y1 = ys[-1] + return np.array((x0, y0, x1, y1), dtype=np.float32) + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + rle = mask_util.frPyObjects(polygons_norm, M, M) + mask = np.array(mask_util.decode(rle), dtype=np.float32) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=2) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + + return boxes_from_polys + + +def rle_mask_voting( + top_masks, 
all_masks, all_dets, iou_thresh, binarize_thresh, method='AVG' +): + """Returns new masks (in correspondence with `top_masks`) by combining + multiple overlapping masks coming from the pool of `all_masks`. Two methods + for combining masks are supported: 'AVG' uses a weighted average of + overlapping mask pixels; 'UNION' takes the union of all mask pixels. + """ + if len(top_masks) == 0: + return + + all_not_crowd = [False] * len(all_masks) + top_to_all_overlaps = mask_util.iou(top_masks, all_masks, all_not_crowd) + decoded_all_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in all_masks + ] + decoded_top_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in top_masks + ] + all_boxes = all_dets[:, :4].astype(np.int32) + all_scores = all_dets[:, 4] + + # Fill box support with weights + mask_shape = decoded_all_masks[0].shape + mask_weights = np.zeros((len(all_masks), mask_shape[0], mask_shape[1])) + for k in range(len(all_masks)): + ref_box = all_boxes[k] + x_0 = max(ref_box[0], 0) + x_1 = min(ref_box[2] + 1, mask_shape[1]) + y_0 = max(ref_box[1], 0) + y_1 = min(ref_box[3] + 1, mask_shape[0]) + mask_weights[k, y_0:y_1, x_0:x_1] = all_scores[k] + mask_weights = np.maximum(mask_weights, 1e-5) + + top_segms_out = [] + for k in range(len(top_masks)): + # Corner case of empty mask + if decoded_top_masks[k].sum() == 0: + top_segms_out.append(top_masks[k]) + continue + + inds_to_vote = np.where(top_to_all_overlaps[k] >= iou_thresh)[0] + # Only matches itself + if len(inds_to_vote) == 1: + top_segms_out.append(top_masks[k]) + continue + + masks_to_vote = [decoded_all_masks[i] for i in inds_to_vote] + if method == 'AVG': + ws = mask_weights[inds_to_vote] + soft_mask = np.average(masks_to_vote, axis=0, weights=ws) + mask = np.array(soft_mask > binarize_thresh, dtype=np.uint8) + elif method == 'UNION': + # Any pixel that's on joins the mask + soft_mask = np.sum(masks_to_vote, axis=0) + mask = np.array(soft_mask > 1e-5, dtype=np.uint8) + 
else: + raise NotImplementedError('Method {} is unknown'.format(method)) + rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] + top_segms_out.append(rle) + + return top_segms_out + + +def rle_mask_nms(masks, dets, thresh, mode='IOU'): + """Performs greedy non-maximum suppression based on an overlap measurement + between masks. The type of measurement is determined by `mode` and can be + either 'IOU' (standard intersection over union) or 'IOMA' (intersection over + mininum area). + """ + if len(masks) == 0: + return [] + if len(masks) == 1: + return [0] + + if mode == 'IOU': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(union(m1, m2)) + all_not_crowds = [False] * len(masks) + ious = mask_util.iou(masks, masks, all_not_crowds) + elif mode == 'IOMA': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / min(area(m1), area(m2)) + all_crowds = [True] * len(masks) + # ious[m1, m2] = area(intersect(m1, m2)) / area(m2) + ious = mask_util.iou(masks, masks, all_crowds) + # ... 
= max(area(intersect(m1, m2)) / area(m2), + # area(intersect(m2, m1)) / area(m1)) + ious = np.maximum(ious, ious.transpose()) + elif mode == 'CONTAINMENT': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(m2) + # Which measures how much m2 is contained inside m1 + all_crowds = [True] * len(masks) + ious = mask_util.iou(masks, masks, all_crowds) + else: + raise NotImplementedError('Mode {} is unknown'.format(mode)) + + scores = dets[:, 4] + order = np.argsort(-scores) + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = ious[i, order[1:]] + inds_to_keep = np.where(ovr <= thresh)[0] + order = order[inds_to_keep + 1] + + return keep + + +def rle_masks_to_boxes(masks): + """Computes the bounding box of each mask in a list of RLE encoded masks.""" + if len(masks) == 0: + return [] + + decoded_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in masks + ] + + def get_bounds(flat_mask): + inds = np.where(flat_mask > 0)[0] + return inds.min(), inds.max() + + boxes = np.zeros((len(decoded_masks), 4)) + keep = [True] * len(decoded_masks) + for i, mask in enumerate(decoded_masks): + if mask.sum() == 0: + keep[i] = False + continue + flat_mask = mask.sum(axis=0) + x0, x1 = get_bounds(flat_mask) + flat_mask = mask.sum(axis=1) + y0, y1 = get_bounds(flat_mask) + boxes[i, :] = (x0, y0, x1, y1) + + return boxes, np.where(keep)[0] diff --git a/detectron/utils/subprocess.py b/detectron/utils/subprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..c7911886a32d43bfbb41d9356ffcfa9d40d7feb4 --- /dev/null +++ b/detectron/utils/subprocess.py @@ -0,0 +1,133 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +"""Primitives for running multiple single-GPU jobs in parallel over subranges of +data. These are used for running multi-GPU inference. Subprocesses are used to +avoid the GIL since inference may involve non-trivial amounts of Python code. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import os +import numpy as np +import subprocess +from six.moves import shlex_quote + +from detectron.core.config import cfg +from detectron.utils.io import load_object +import detectron.utils.env as envu + +import logging +logger = logging.getLogger(__name__) + + +def process_in_parallel( + tag, total_range_size, binary, output_dir, opts='' +): + """Run the specified binary cfg.NUM_GPUS times in parallel, each time as a + subprocess that uses one GPU. The binary must accept the command line + arguments `--range {start} {end}` that specify a data processing range. 
+ """ + # Snapshot the current cfg state in order to pass to the inference + # subprocesses + cfg_file = os.path.join(output_dir, '{}_range_config.yaml'.format(tag)) + with open(cfg_file, 'w') as f: + envu.yaml_dump(cfg, stream=f) + subprocess_env = os.environ.copy() + processes = [] + subinds = np.array_split(range(total_range_size), cfg.NUM_GPUS) + # Determine GPUs to use + cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES') + if cuda_visible_devices: + gpu_inds = map(int, cuda_visible_devices.split(',')) + assert -1 not in gpu_inds, \ + 'Hiding GPU indices using the \'-1\' index is not supported' + else: + gpu_inds = range(cfg.NUM_GPUS) + # Run the binary in cfg.NUM_GPUS subprocesses + for i, gpu_ind in enumerate(gpu_inds): + start = subinds[i][0] + end = subinds[i][-1] + 1 + subprocess_env['CUDA_VISIBLE_DEVICES'] = str(gpu_ind) + cmd = '{binary} --range {start} {end} --cfg {cfg_file} NUM_GPUS 1 {opts}' + cmd = cmd.format( + binary=shlex_quote(binary), + start=int(start), + end=int(end), + cfg_file=shlex_quote(cfg_file), + opts=' '.join([shlex_quote(opt) for opt in opts]) + ) + logger.info('{} range command {}: {}'.format(tag, i, cmd)) + if i == 0: + subprocess_stdout = subprocess.PIPE + else: + filename = os.path.join( + output_dir, '%s_range_%s_%s.stdout' % (tag, start, end) + ) + subprocess_stdout = open(filename, 'w') # NOQA (close below) + p = subprocess.Popen( + cmd, + shell=True, + env=subprocess_env, + stdout=subprocess_stdout, + stderr=subprocess.STDOUT, + bufsize=1 + ) + processes.append((i, p, start, end, subprocess_stdout)) + # Log output from inference processes and collate their results + outputs = [] + for i, p, start, end, subprocess_stdout in processes: + log_subprocess_output(i, p, output_dir, tag, start, end) + if i > 0: + subprocess_stdout.close() + range_file = os.path.join( + output_dir, '%s_range_%s_%s.pkl' % (tag, start, end) + ) + range_data = load_object(range_file) + outputs.append(range_data) + return outputs + + +def 
log_subprocess_output(i, p, output_dir, tag, start, end): + """Capture the output of each subprocess and log it in the parent process. + The first subprocess's output is logged in realtime. The output from the + other subprocesses is buffered and then printed all at once (in order) when + subprocesses finish. + """ + outfile = os.path.join( + output_dir, '%s_range_%s_%s.stdout' % (tag, start, end) + ) + logger.info('# ' + '-' * 76 + ' #') + logger.info( + 'stdout of subprocess %s with range [%s, %s]' % (i, start + 1, end) + ) + logger.info('# ' + '-' * 76 + ' #') + if i == 0: + # Stream the piped stdout from the first subprocess in realtime + with open(outfile, 'wb') as f: + for line in iter(p.stdout.readline, b''): + print(line.rstrip().decode("utf8")) + f.write(line) + p.stdout.close() + ret = p.wait() + else: + # For subprocesses >= 1, wait and dump their log file + ret = p.wait() + with open(outfile, 'r') as f: + print(''.join(f.readlines())) + assert ret == 0, 'Range subprocess failed (exit code: {})'.format(ret) diff --git a/detectron/utils/timer.py b/detectron/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..69a20dbde18434ff3b4015102efb5a1c4f95d53b --- /dev/null +++ b/detectron/utils/timer.py @@ -0,0 +1,60 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+##############################################################################
+#
+# Based on:
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Timing related functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import time
+
+
+class Timer(object):
+    """A simple timer."""
+
+    def __init__(self):
+        self.reset()
+
+    def tic(self):
+        """Start (or restart) the timer."""
+        # using time.time instead of time.clock because time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        """Stop timing, accumulate the elapsed interval, and return either the
+        running average over all calls (average=True) or just this interval.
+        """
+        self.diff = time.time() - self.start_time
+        self.total_time += self.diff
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        if average:
+            return self.average_time
+        else:
+            return self.diff
+
+    def reset(self):
+        """Zero all accumulated timing state."""
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
diff --git a/detectron/utils/train.py b/detectron/utils/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5f739334146acd6a88e40b360143f4fc997e476
--- /dev/null
+++ b/detectron/utils/train.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Utilities driving the train_net binary""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from shutil import copyfile +import cv2 # NOQA (Must import before importing caffe2 due to bug in cv2) +import logging +import numpy as np +import os +import re + +from caffe2.python import memonger +from caffe2.python import workspace + +from detectron.core.config import cfg +from detectron.core.config import get_output_dir +from detectron.datasets.roidb import combined_roidb_for_training +from detectron.modeling import model_builder +from detectron.utils import lr_policy +from detectron.utils.training_stats import TrainingStats +import detectron.utils.env as envu +import detectron.utils.net as nu + + +def train_model(): + """Model training loop.""" + model, weights_file, start_iter, checkpoints, output_dir = create_model() + if 'final' in checkpoints: + # The final model was found in the output directory, so nothing to do + return checkpoints + + setup_model_for_training(model, weights_file, output_dir) + training_stats = TrainingStats(model) + CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) + + for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): + if model.roi_data_loader.has_stopped(): + handle_critical_error(model, 'roi_data_loader failed') + training_stats.IterTic() + lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) + workspace.RunNet(model.net.Proto().name) + if 
cur_iter == start_iter: + nu.print_net(model) + training_stats.IterToc() + training_stats.UpdateIterStats() + training_stats.LogIterStats(cur_iter, lr) + + if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: + checkpoints[cur_iter] = os.path.join( + output_dir, 'model_iter{}.pkl'.format(cur_iter) + ) + nu.save_model_to_weights_file(checkpoints[cur_iter], model) + + if cur_iter == start_iter + training_stats.LOG_PERIOD: + # Reset the iteration timer to remove outliers from the first few + # SGD iterations + training_stats.ResetIterTimer() + + if np.isnan(training_stats.iter_total_loss): + handle_critical_error(model, 'Loss is NaN') + + # Save the final model + checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') + nu.save_model_to_weights_file(checkpoints['final'], model) + # Shutdown data loading threads + model.roi_data_loader.shutdown() + return checkpoints + + +def handle_critical_error(model, msg): + logger = logging.getLogger(__name__) + logger.critical(msg) + model.roi_data_loader.shutdown() + raise Exception(msg) + + +def create_model(): + """Build the model and look for saved model checkpoints in case we can + resume from one. 
+ """ + logger = logging.getLogger(__name__) + start_iter = 0 + checkpoints = {} + output_dir = get_output_dir(cfg.TRAIN.DATASETS, training=True) + weights_file = cfg.TRAIN.WEIGHTS + if cfg.TRAIN.AUTO_RESUME: + # Check for the final model (indicates training already finished) + final_path = os.path.join(output_dir, 'model_final.pkl') + if os.path.exists(final_path): + logger.info('model_final.pkl exists; no need to train!') + return None, None, None, {'final': final_path}, output_dir + + if cfg.TRAIN.COPY_WEIGHTS: + copyfile( + weights_file, + os.path.join(output_dir, os.path.basename(weights_file))) + logger.info('Copy {} to {}'.format(weights_file, output_dir)) + + # Find the most recent checkpoint (highest iteration number) + files = os.listdir(output_dir) + for f in files: + iter_string = re.findall(r'(?<=model_iter)\d+(?=\.pkl)', f) + if len(iter_string) > 0: + checkpoint_iter = int(iter_string[0]) + if checkpoint_iter > start_iter: + # Start one iteration immediately after the checkpoint iter + start_iter = checkpoint_iter + 1 + resume_weights_file = f + + if start_iter > 0: + # Override the initialization weights with the found checkpoint + weights_file = os.path.join(output_dir, resume_weights_file) + logger.info( + '========> Resuming from checkpoint {} at start iter {}'. 
+ format(weights_file, start_iter) + ) + + logger.info('Building model: {}'.format(cfg.MODEL.TYPE)) + model = model_builder.create(cfg.MODEL.TYPE, train=True) + if cfg.MEMONGER: + optimize_memory(model) + # Performs random weight initialization as defined by the model + workspace.RunNetOnce(model.param_init_net) + return model, weights_file, start_iter, checkpoints, output_dir + + +def optimize_memory(model): + """Save GPU memory through blob sharing.""" + for device in range(cfg.NUM_GPUS): + namescope = 'gpu_{}/'.format(device) + losses = [namescope + l for l in model.losses] + model.net._net = memonger.share_grad_blobs( + model.net, + losses, + set(model.param_to_grad.values()), + namescope, + share_activations=cfg.MEMONGER_SHARE_ACTIVATIONS + ) + + +def setup_model_for_training(model, weights_file, output_dir): + """Loaded saved weights and create the network in the C2 workspace.""" + logger = logging.getLogger(__name__) + add_model_training_inputs(model) + + if weights_file: + # Override random weight initialization with weights from a saved model + nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0) + # Even if we're randomly initializing we still need to synchronize + # parameters across GPUs + nu.broadcast_parameters(model) + workspace.CreateNet(model.net) + + logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir))) + dump_proto_files(model, output_dir) + + # Start loading mini-batches and enqueuing blobs + model.roi_data_loader.register_sigint_handler() + model.roi_data_loader.start(prefill=True) + return output_dir + + +def add_model_training_inputs(model): + """Load the training dataset and attach the training inputs to the model.""" + logger = logging.getLogger(__name__) + logger.info('Loading dataset: {}'.format(cfg.TRAIN.DATASETS)) + roidb = combined_roidb_for_training( + cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES + ) + logger.info('{:d} roidb entries'.format(len(roidb))) + model_builder.add_training_inputs(model, 
roidb=roidb) + + +def dump_proto_files(model, output_dir): + """Save prototxt descriptions of the training network and parameter + initialization network.""" + with open(os.path.join(output_dir, 'net.pbtxt'), 'w') as fid: + fid.write(str(model.net.Proto())) + with open(os.path.join(output_dir, 'param_init_net.pbtxt'), 'w') as fid: + fid.write(str(model.param_init_net.Proto())) diff --git a/detectron/utils/training_stats.py b/detectron/utils/training_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..36e586ef83d2a7656bd055053391fa651e62cc90 --- /dev/null +++ b/detectron/utils/training_stats.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Utilities for training.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import datetime +import numpy as np + +from caffe2.python import utils as c2_py_utils + +from detectron.core.config import cfg +from detectron.utils.logging import log_json_stats +from detectron.utils.logging import SmoothedValue +from detectron.utils.timer import Timer +import detectron.utils.net as nu + + +class TrainingStats(object): + """Track vital training statistics.""" + + def __init__(self, model): + # Window size for smoothing tracked values (with median filtering) + self.WIN_SZ = 20 + # Output logging period in SGD iterations + self.LOG_PERIOD = 20 + self.smoothed_losses_and_metrics = { + key: SmoothedValue(self.WIN_SZ) + for key in model.losses + model.metrics + } + self.losses_and_metrics = { + key: 0 + for key in model.losses + model.metrics + } + self.smoothed_total_loss = SmoothedValue(self.WIN_SZ) + self.smoothed_mb_qsize = SmoothedValue(self.WIN_SZ) + self.iter_total_loss = np.nan + self.iter_timer = Timer() + self.model = model + + def IterTic(self): + self.iter_timer.tic() + + def IterToc(self): + return self.iter_timer.toc(average=False) + + def ResetIterTimer(self): + self.iter_timer.reset() + + def UpdateIterStats(self): + """Update tracked iteration statistics.""" + for k in self.losses_and_metrics.keys(): + if k in self.model.losses: + self.losses_and_metrics[k] = nu.sum_multi_gpu_blob(k) + else: + self.losses_and_metrics[k] = nu.average_multi_gpu_blob(k) + for k, v in self.smoothed_losses_and_metrics.items(): + v.AddValue(self.losses_and_metrics[k]) + self.iter_total_loss = np.sum( + np.array([self.losses_and_metrics[k] for k in self.model.losses]) + ) + self.smoothed_total_loss.AddValue(self.iter_total_loss) + self.smoothed_mb_qsize.AddValue( + 
self.model.roi_data_loader._minibatch_queue.qsize() + ) + + def LogIterStats(self, cur_iter, lr): + """Log the tracked statistics.""" + if (cur_iter % self.LOG_PERIOD == 0 or + cur_iter == cfg.SOLVER.MAX_ITER - 1): + stats = self.GetStats(cur_iter, lr) + log_json_stats(stats) + + def GetStats(self, cur_iter, lr): + eta_seconds = self.iter_timer.average_time * ( + cfg.SOLVER.MAX_ITER - cur_iter + ) + eta = str(datetime.timedelta(seconds=int(eta_seconds))) + mem_stats = c2_py_utils.GetGPUMemoryUsageStats() + mem_usage = np.max(mem_stats['max_by_gpu'][:cfg.NUM_GPUS]) + stats = dict( + iter=cur_iter, + lr=float(lr), + time=self.iter_timer.average_time, + loss=self.smoothed_total_loss.GetMedianValue(), + eta=eta, + mb_qsize=int( + np.round(self.smoothed_mb_qsize.GetMedianValue()) + ), + mem=int(np.ceil(mem_usage / 1024 / 1024)) + ) + for k, v in self.smoothed_losses_and_metrics.items(): + stats[k] = v.GetMedianValue() + return stats diff --git a/detectron/utils/vis.py b/detectron/utils/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..0234a1dd68821040160e91c4a4be5c392857e7aa --- /dev/null +++ b/detectron/utils/vis.py @@ -0,0 +1,394 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + +"""Detection output visualization module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import cv2 +import numpy as np +import os + +import pycocotools.mask as mask_util + +from detectron.utils.colormap import colormap +import detectron.utils.env as envu +import detectron.utils.keypoints as keypoint_utils + +# Matplotlib requires certain adjustments in some environments +# Must happen before importing matplotlib +envu.set_up_matplotlib() +import matplotlib.pyplot as plt +from matplotlib.patches import Polygon + +plt.rcParams['pdf.fonttype'] = 42 # For editing in Adobe Illustrator + + +_GRAY = (218, 227, 218) +_GREEN = (18, 127, 15) +_WHITE = (255, 255, 255) + + +def kp_connections(keypoints): + kp_lines = [ + [keypoints.index('left_eye'), keypoints.index('right_eye')], + [keypoints.index('left_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('right_ear')], + [keypoints.index('left_eye'), keypoints.index('left_ear')], + [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], + [keypoints.index('right_elbow'), keypoints.index('right_wrist')], + [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], + [keypoints.index('left_elbow'), keypoints.index('left_wrist')], + [keypoints.index('right_hip'), keypoints.index('right_knee')], + [keypoints.index('right_knee'), keypoints.index('right_ankle')], + [keypoints.index('left_hip'), keypoints.index('left_knee')], + [keypoints.index('left_knee'), keypoints.index('left_ankle')], + [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], + [keypoints.index('right_hip'), keypoints.index('left_hip')], + ] + return kp_lines + + +def convert_from_cls_format(cls_boxes, cls_segms, cls_keyps): + """Convert from 
the class boxes/segms/keyps format generated by the testing + code. + """ + box_list = [b for b in cls_boxes if len(b) > 0] + if len(box_list) > 0: + boxes = np.concatenate(box_list) + else: + boxes = None + if cls_segms is not None: + segms = [s for slist in cls_segms for s in slist] + else: + segms = None + if cls_keyps is not None: + keyps = [k for klist in cls_keyps for k in klist] + else: + keyps = None + classes = [] + for j in range(len(cls_boxes)): + classes += [j] * len(cls_boxes[j]) + return boxes, segms, keyps, classes + + +def get_class_string(class_index, score, dataset): + class_text = dataset.classes[class_index] if dataset is not None else \ + 'id{:d}'.format(class_index) + return class_text + ' {:0.2f}'.format(score).lstrip('0') + + +def vis_mask(img, mask, col, alpha=0.4, show_border=True, border_thick=1): + """Visualizes a single binary mask.""" + + img = img.astype(np.float32) + idx = np.nonzero(mask) + + img[idx[0], idx[1], :] *= 1.0 - alpha + img[idx[0], idx[1], :] += alpha * col + + if show_border: + contours = cv2.findContours( + mask.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)[-2] + cv2.drawContours(img, contours, -1, _WHITE, border_thick, cv2.LINE_AA) + + return img.astype(np.uint8) + + +def vis_class(img, pos, class_str, font_scale=0.35): + """Visualizes the class.""" + img = img.astype(np.uint8) + x0, y0 = int(pos[0]), int(pos[1]) + # Compute text size. + txt = class_str + font = cv2.FONT_HERSHEY_SIMPLEX + ((txt_w, txt_h), _) = cv2.getTextSize(txt, font, font_scale, 1) + # Place text background. + back_tl = x0, y0 - int(1.3 * txt_h) + back_br = x0 + txt_w, y0 + cv2.rectangle(img, back_tl, back_br, _GREEN, -1) + # Show text. 
+ txt_tl = x0, y0 - int(0.3 * txt_h) + cv2.putText(img, txt, txt_tl, font, font_scale, _GRAY, lineType=cv2.LINE_AA) + return img + + +def vis_bbox(img, bbox, thick=1): + """Visualizes a bounding box.""" + img = img.astype(np.uint8) + (x0, y0, w, h) = bbox + x1, y1 = int(x0 + w), int(y0 + h) + x0, y0 = int(x0), int(y0) + cv2.rectangle(img, (x0, y0), (x1, y1), _GREEN, thickness=thick) + return img + + +def vis_keypoints(img, kps, kp_thresh=2, alpha=0.7): + """Visualizes keypoints (adapted from vis_one_image). + kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob). + """ + dataset_keypoints, _ = keypoint_utils.get_keypoints() + kp_lines = kp_connections(dataset_keypoints) + + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)] + colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] + + # Perform the drawing on a copy of the image, to allow for blending. + kp_mask = np.copy(img) + + # Draw mid shoulder / mid hip first for better visualization. 
+ mid_shoulder = ( + kps[:2, dataset_keypoints.index('right_shoulder')] + + kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0 + sc_mid_shoulder = np.minimum( + kps[2, dataset_keypoints.index('right_shoulder')], + kps[2, dataset_keypoints.index('left_shoulder')]) + mid_hip = ( + kps[:2, dataset_keypoints.index('right_hip')] + + kps[:2, dataset_keypoints.index('left_hip')]) / 2.0 + sc_mid_hip = np.minimum( + kps[2, dataset_keypoints.index('right_hip')], + kps[2, dataset_keypoints.index('left_hip')]) + nose_idx = dataset_keypoints.index('nose') + if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh: + cv2.line( + kp_mask, tuple(mid_shoulder), tuple(kps[:2, nose_idx]), + color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA) + if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: + cv2.line( + kp_mask, tuple(mid_shoulder), tuple(mid_hip), + color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA) + + # Draw the keypoints. + for l in range(len(kp_lines)): + i1 = kp_lines[l][0] + i2 = kp_lines[l][1] + p1 = kps[0, i1], kps[1, i1] + p2 = kps[0, i2], kps[1, i2] + if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: + cv2.line( + kp_mask, p1, p2, + color=colors[l], thickness=2, lineType=cv2.LINE_AA) + if kps[2, i1] > kp_thresh: + cv2.circle( + kp_mask, p1, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + if kps[2, i2] > kp_thresh: + cv2.circle( + kp_mask, p2, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + + # Blend the keypoints. 
+ return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) + + +def vis_one_image_opencv( + im, boxes, segms=None, keypoints=None, thresh=0.9, kp_thresh=2, + show_box=False, dataset=None, show_class=False): + """Constructs a numpy array with the detections visualized.""" + + if isinstance(boxes, list): + boxes, segms, keypoints, classes = convert_from_cls_format( + boxes, segms, keypoints) + + if boxes is None or boxes.shape[0] == 0 or max(boxes[:, 4]) < thresh: + return im + + if segms is not None and len(segms) > 0: + masks = mask_util.decode(segms) + color_list = colormap() + mask_color_id = 0 + + # Display in largest to smallest order to reduce occlusion + areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + sorted_inds = np.argsort(-areas) + + for i in sorted_inds: + bbox = boxes[i, :4] + score = boxes[i, -1] + if score < thresh: + continue + + # show box (off by default) + if show_box: + im = vis_bbox( + im, (bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1])) + + # show class (off by default) + if show_class: + class_str = get_class_string(classes[i], score, dataset) + im = vis_class(im, (bbox[0], bbox[1] - 2), class_str) + + # show mask + if segms is not None and len(segms) > i: + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + im = vis_mask(im, masks[..., i], color_mask) + + # show keypoints + if keypoints is not None and len(keypoints) > i: + im = vis_keypoints(im, keypoints[i], kp_thresh) + + return im + + +def vis_one_image( + im, im_name, output_dir, boxes, segms=None, keypoints=None, thresh=0.9, + kp_thresh=2, dpi=200, box_alpha=0.0, dataset=None, show_class=False, + ext='pdf', out_when_no_box=False): + """Visual debugging of detections.""" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if isinstance(boxes, list): + boxes, segms, keypoints, classes = convert_from_cls_format( + boxes, segms, keypoints) + + if (boxes is None or boxes.shape[0] == 0 or max(boxes[:, 4]) < 
thresh) and not out_when_no_box: + return + + dataset_keypoints, _ = keypoint_utils.get_keypoints() + + if segms is not None and len(segms) > 0: + masks = mask_util.decode(segms) + + color_list = colormap(rgb=True) / 255 + + kp_lines = kp_connections(dataset_keypoints) + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)] + + fig = plt.figure(frameon=False) + fig.set_size_inches(im.shape[1] / dpi, im.shape[0] / dpi) + ax = plt.Axes(fig, [0., 0., 1., 1.]) + ax.axis('off') + fig.add_axes(ax) + ax.imshow(im) + + if boxes is None: + sorted_inds = [] # avoid crash when 'boxes' is None + else: + # Display in largest to smallest order to reduce occlusion + areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + sorted_inds = np.argsort(-areas) + + mask_color_id = 0 + for i in sorted_inds: + bbox = boxes[i, :4] + score = boxes[i, -1] + if score < thresh: + continue + + # show box (off by default) + ax.add_patch( + plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], + fill=False, edgecolor='g', + linewidth=0.5, alpha=box_alpha)) + + if show_class: + ax.text( + bbox[0], bbox[1] - 2, + get_class_string(classes[i], score, dataset), + fontsize=3, + family='serif', + bbox=dict( + facecolor='g', alpha=0.4, pad=0, edgecolor='none'), + color='white') + + # show mask + if segms is not None and len(segms) > i: + img = np.ones(im.shape) + color_mask = color_list[mask_color_id % len(color_list), 0:3] + mask_color_id += 1 + + w_ratio = .4 + for c in range(3): + color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio + for c in range(3): + img[:, :, c] = color_mask[c] + e = masks[:, :, i] + + contour = cv2.findContours( + e.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)[-2] + + for c in contour: + polygon = Polygon( + c.reshape((-1, 2)), + fill=True, facecolor=color_mask, + edgecolor='w', linewidth=1.2, + alpha=0.5) + ax.add_patch(polygon) + + # show keypoints + if keypoints is not None and len(keypoints) > 
i: + kps = keypoints[i] + plt.autoscale(False) + for l in range(len(kp_lines)): + i1 = kp_lines[l][0] + i2 = kp_lines[l][1] + if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: + x = [kps[0, i1], kps[0, i2]] + y = [kps[1, i1], kps[1, i2]] + line = plt.plot(x, y) + plt.setp(line, color=colors[l], linewidth=1.0, alpha=0.7) + if kps[2, i1] > kp_thresh: + plt.plot( + kps[0, i1], kps[1, i1], '.', color=colors[l], + markersize=3.0, alpha=0.7) + + if kps[2, i2] > kp_thresh: + plt.plot( + kps[0, i2], kps[1, i2], '.', color=colors[l], + markersize=3.0, alpha=0.7) + + # add mid shoulder / mid hip for better visualization + mid_shoulder = ( + kps[:2, dataset_keypoints.index('right_shoulder')] + + kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0 + sc_mid_shoulder = np.minimum( + kps[2, dataset_keypoints.index('right_shoulder')], + kps[2, dataset_keypoints.index('left_shoulder')]) + mid_hip = ( + kps[:2, dataset_keypoints.index('right_hip')] + + kps[:2, dataset_keypoints.index('left_hip')]) / 2.0 + sc_mid_hip = np.minimum( + kps[2, dataset_keypoints.index('right_hip')], + kps[2, dataset_keypoints.index('left_hip')]) + if (sc_mid_shoulder > kp_thresh and + kps[2, dataset_keypoints.index('nose')] > kp_thresh): + x = [mid_shoulder[0], kps[0, dataset_keypoints.index('nose')]] + y = [mid_shoulder[1], kps[1, dataset_keypoints.index('nose')]] + line = plt.plot(x, y) + plt.setp( + line, color=colors[len(kp_lines)], linewidth=1.0, alpha=0.7) + if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: + x = [mid_shoulder[0], mid_hip[0]] + y = [mid_shoulder[1], mid_hip[1]] + line = plt.plot(x, y) + plt.setp( + line, color=colors[len(kp_lines) + 1], linewidth=1.0, + alpha=0.7) + + output_name = os.path.basename(im_name) + '.' 
+ ext + fig.savefig(os.path.join(output_dir, '{}'.format(output_name)), dpi=dpi) + plt.close('all') diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..38cc0eaf6e2bbba79fef786dc7b35791688142bd --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,28 @@ +# Use Caffe2 image as parent image +FROM caffe2/caffe2:snapshot-py2-cuda9.0-cudnn7-ubuntu16.04 + +RUN mv /usr/local/caffe2 /usr/local/caffe2_build +ENV Caffe2_DIR /usr/local/caffe2_build + +ENV PYTHONPATH /usr/local/caffe2_build:${PYTHONPATH} +ENV LD_LIBRARY_PATH /usr/local/caffe2_build/lib:${LD_LIBRARY_PATH} + +# Clone the Detectron repository +RUN git clone https://github.com/facebookresearch/detectron /detectron + +# Install Python dependencies +RUN pip install -r /detectron/requirements.txt + +# Install the COCO API +RUN git clone https://github.com/cocodataset/cocoapi.git /cocoapi +WORKDIR /cocoapi/PythonAPI +RUN make install + +# Go to Detectron root +WORKDIR /detectron + +# Set up Python modules +RUN make + +# [Optional] Build custom ops +RUN make ops diff --git a/projects/GN/README.md b/projects/GN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5757189a6f2971be4414570e33d53b9c3202e7f9 --- /dev/null +++ b/projects/GN/README.md @@ -0,0 +1,304 @@ +# Group Normalization for Mask R-CNN + +
+ +
+ +## Introduction + +This file provides Mask R-CNN baseline results and models trained with [Group Normalization](https://arxiv.org/abs/1803.08494): + +``` +@article{GroupNorm2018, + title={Group Normalization}, + author={Yuxin Wu and Kaiming He}, + journal={arXiv:1803.08494}, + year={2018} +} +``` + +**Note:** This code uses the GroupNorm op implemented in CUDA, included in the Caffe2 repo. When writing this document, Caffe2 is being merged into PyTorch, and the GroupNorm op is located [here](https://github.com/pytorch/pytorch/blob/master/caffe2/operators/group_norm_op.cu). Make sure your Caffe2 is up to date. + +## Pretrained Models with GN + +These models are trained in Caffe2 on the standard ImageNet-1k dataset, using GroupNorm with 32 groups (G=32). + +- [R-50-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl): ResNet-50 with GN, 24.0\% top-1 error (center-crop). +- [R-101-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl): ResNet-101 with GN, 22.6\% top-1 error (center-crop). + +## Results + +### Baselines with BN + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
         case          typelr
schd
im/
gpu
train
mem
(GB)
train
time
(s/iter)
train
time
total
(hr)
inference
time
(s/im)
box
AP
mask
AP
model id
R-50-FPN, BN*Mask R-CNN2x28.60.89744.90.099 + 0.01838.634.535859007
R-101-FPN, BN*Mask R-CNN2x210.20.99349.70.126 + 0.01740.936.435861858
+ +**Notes:** + +- This table is copied from [Detectron Model Zoo](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#end-to-end-faster--mask-r-cnn-baselines). +- BN* means that BatchNorm (BN) is used for pre-training and is frozen and turned into a per-channel linear layer when fine-tuning. This is the default of Faster/Mask R-CNN and Detectron. + +### Mask R-CNN with GN + +#### Standard Mask R-CNN recipe + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
         case          typelr
schd
im/
gpu
train
mem
(GB)
train
time
(s/iter)
train
time
total
(hr)
inference
time
(s/im)
box
AP
mask
AP
model iddownload
links
R-50-FPN, GNMask R-CNN2x210.51.01750.80.146 + 0.01740.335.748616381 + model +  |  + boxes +  |  + masks
R-101-FPN, GNMask R-CNN2x212.41.15157.50.180 + 0.01541.836.848616724 + model +  |  + boxes +  |  + masks
+ +**Notes:** +- GN is applied on: (i) ResNet layers inherited from pre-training, (ii) the FPN-specific layers, (iii) the RoI bbox head, and (iv) the RoI mask head. +- These GN models use a 4conv+1fc RoI box head. The BN* counterpart with this head performs similarly with the default 2fc head: using this codebase, R-50-FPN BN\* with 4conv+1fc has 38.8/34.4 box/mask AP. +- 2x is the default schedule (180k) in Detectron. + +#### Longer training schedule + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
         case          typelr
schd
im/
gpu
train
mem
(GB)
train
time
(s/iter)
train
time
total
(hr)
inference
time
(s/im)
box
AP
mask
AP
model iddownload
links
R-50-FPN, GNMask R-CNN3x210.51.03377.40.145 + 0.01540.836.148734751 + model +  |  + boxes +  |  + masks
R-101-FPN, GNMask R-CNN3x212.41.17187.90.180 + 0.01442.337.248734779 + model +  |  + boxes +  |  + masks
+ +**Notes:** +- 3x is a longer schedule (270k). GN can improve further when using the longer schedule, but its BN* counterpart remains similar (R-50-FPN BN\*: 38.9/34.3) with the longer schedule. +- These models are **without** any scale augmentation that can further [improve results](https://github.com/facebookresearch/Detectron/blob/master/MODEL_ZOO.md#mask-r-cnn-with-bells--whistles). + + +### Explorations + +#### Training Mask R-CNN from scratch + +GN enables to train Mask R-CNN *from scratch* without ImageNet pre-training, despite the small batch size. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
         case          typelr
schd
im/
gpu
train
mem
(GB)
train
time
(s/iter)
train
time
total
(hr)
inference
time
(s/im)
box
AP
mask
AP
model id
R-50-FPN, GN, scratchMask R-CNN3x210.81.08781.50.140 + 0.01939.535.256421872
R-101-FPN, GN, scratchMask R-CNN3x212.71.24393.20.177 + 0.01941.036.456421911
+ +**Notes:** +- To reproduce these results, see the config yaml files starting with ```scratch ```. +- These are results using ```freeze_at=0```. See this [commit](https://github.com/facebookresearch/Detectron/commit/f8ffc87ca442d8f6bd2b9aad11029b5db56d7260) about the related issue. + +  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
R-50-FPN, GN, scratchMask R-CNN3x210.50.99074.30.146 + 0.02036.232.549025460
R-101-FPN, GN, scratchMask R-CNN3x212.41.12484.30.180 + 0.01937.533.349024951
+ +**Notes:** +- These are early results that followed the default training using ```freeze_at=2```. This means the layers of conv1 and res2 were simply random weights in the case of training from-scratch. See this [commit](https://github.com/facebookresearch/Detectron/commit/f8ffc87ca442d8f6bd2b9aad11029b5db56d7260) about the related issue. diff --git a/projects/GN/gn.jpg b/projects/GN/gn.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5f04015ab94ef3c8576f3f79e09544479458b961 Binary files /dev/null and b/projects/GN/gn.jpg differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..23a0ae0bf62dbf1f3be2b62964d4865ae4334dbc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +numpy>=1.13 +pyyaml==3.12 +matplotlib +opencv-python>=3.2 +setuptools +Cython +mock +scipy +six +future +protobuf diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..234e736c140575d81410e6d888cb2000cd840974 --- /dev/null +++ b/setup.py @@ -0,0 +1,61 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+##############################################################################
+
+# setup.py: builds Detectron's two Cython extension modules
+# (detectron.utils.cython_bbox and detectron.utils.cython_nms) and registers
+# the 'detectron' package with setuptools.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from Cython.Build import cythonize
+from setuptools import Extension
+from setuptools import setup
+
+import numpy as np
+
+# Directory containing the NumPy C headers; passed via include_dirs below so
+# the C code generated from the .pyx sources can find them.
+_NP_INCLUDE_DIRS = np.get_include()
+
+
+# Extension modules
+ext_modules = [
+    Extension(
+        name='detectron.utils.cython_bbox',
+        sources=[
+            'detectron/utils/cython_bbox.pyx'
+        ],
+        # -Wno-cpp silences GCC's "#warning" preprocessor notices
+        # (presumably NumPy's deprecated-C-API warning emitted from its
+        # headers -- TODO confirm against the build log).
+        extra_compile_args=[
+            '-Wno-cpp'
+        ],
+        include_dirs=[
+            _NP_INCLUDE_DIRS
+        ]
+    ),
+    Extension(
+        name='detectron.utils.cython_nms',
+        sources=[
+            'detectron/utils/cython_nms.pyx'
+        ],
+        # Same warning suppression as for cython_bbox above.
+        extra_compile_args=[
+            '-Wno-cpp'
+        ],
+        include_dirs=[
+            _NP_INCLUDE_DIRS
+        ]
+    )
+]
+
+# cythonize() transpiles the .pyx sources to C; setuptools then compiles the
+# result into the detectron.utils.* shared libraries.
+setup(
+    name='Detectron',
+    packages=['detectron'],
+    ext_modules=cythonize(ext_modules)
+)
diff --git "a/\346\226\260\345\273\272\346\226\207\346\234\254\346\226\207\346\241\243.txt" "b/\346\226\260\345\273\272\346\226\207\346\234\254\346\226\207\346\241\243.txt"
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391